[Cryptech-Commits] [core/math/modexpa7] branch systolic updated: Finished modulus-dependent coefficient calculation module: * fixed bug with latency compensation * cleaned up Verilog source * added 512-bit testbench * works in simulator * synthesizes without warnings

git at cryptech.is git at cryptech.is
Fri Jun 30 23:10:27 UTC 2017


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch systolic
in repository core/math/modexpa7.

The following commit(s) were added to refs/heads/systolic by this push:
     new 1fd8037  Finished modulus-dependent coefficient calculation module:  * fixed bug with latency compensation  * cleaned up Verilog source  * added 512-bit testbench  * works in simulator  * synthesizes without warnings
1fd8037 is described below

commit 1fd8037d41be46d24b3610c89f781fe85def4317
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Sat Jul 1 02:05:02 2017 +0300

    Finished modulus-dependent coefficient calculation module:
     * fixed bug with latency compensation
     * cleaned up Verilog source
     * added 512-bit testbench
     * works in simulator
     * synthesizes without warnings
    
    Changes:
     * made latency of generic processing element configurable
---
 src/rtl/modexpa7_factor.v       |  57 ---
 src/rtl/modexpa7_n_coeff.v      | 745 +++++++++++++++++++++++++++++-----------
 src/rtl/pe/modexpa7_pe_mul.v    |  41 +--
 src/tb/tb_factor.v              |   2 +-
 src/tb/tb_n_coeff.v             | 235 ++++++++++---
 src/tb/tb_systolic_multiplier.v |   2 +-
 6 files changed, 739 insertions(+), 343 deletions(-)

diff --git a/src/rtl/modexpa7_factor.v b/src/rtl/modexpa7_factor.v
index 17d4785..510f7af 100644
--- a/src/rtl/modexpa7_factor.v
+++ b/src/rtl/modexpa7_factor.v
@@ -118,63 +118,6 @@ module modexpa7_factor #
 	localparam	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
 	wire			[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last = n_num_words_latch;
 	
-	
-		//
-		// BRAM Addresses
-		//
-		/*
-	reg	[OPERAND_ADDR_WIDTH-1:0]	f_bram_addr_reg;
-		
-	wire	[OPERAND_ADDR_WIDTH-1:0]	f_bram_addr_next = f_bram_addr + 1'b1;
-	
-	wire										f_bram_addr_done =  (f_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
-	
-	assign f_bram_addr = f_bram_addr_reg;
-
-		
-	always @(posedge clk)
-		//
-		case (fsm_next_state)
-		
-			FSM_STATE_INIT_ZERO_ADDR:	f_bram_addr_reg <= bram_addr_zero;
-			FSM_STATE_INIT_NEXT_ADDR:	f_bram_addr_reg <= f_bram_addr_next;
-			
-		endcase
-
-	reg	f_bram_en;
-	
-	assign f_bram_wr = f_bram_en;
-	
-	always @(posedge clk)
-		//
-		case (fsm_next_state)
-			
-			FSM_STATE_INIT_ZERO_ADDR,
-			FSM_STATE_INIT_NEXT_ADDR,
-			FSM_STATE_INIT_LAST_ADDR:	f_bram_en <= 1'b1;
-			default:							f_bram_en <= 1'b0;
-			
-		endcase
-	
-	
-	reg	[31: 0]	f_bram_data;
-	
-	assign f_bram_in = f_bram_data;
-	
-	always @(posedge clk)
-		//
-		case (fsm_next_state)
-			FSM_STATE_INIT_ZERO_ADDR:	f_bram_data <= 32'd1;
-			FSM_STATE_INIT_NEXT_ADDR,
-			FSM_STATE_INIT_LAST_ADDR:	f_bram_data <= 32'd0;
-			default:							f_bram_data <= {32{1'bX}};
-			
-		endcase
-		*/
-		
-
-	
-	
 		//
 		// Cycle Counters
 		//
diff --git a/src/rtl/modexpa7_n_coeff.v b/src/rtl/modexpa7_n_coeff.v
index 1e763ba..cba59e2 100644
--- a/src/rtl/modexpa7_n_coeff.v
+++ b/src/rtl/modexpa7_n_coeff.v
@@ -40,28 +40,28 @@ module modexpa7_n_coeff #
 	(
 			//
 			// This sets the address widths of memory buffers. Internal data
-			// width is 32 bits, so for e.g. 1024-bit operands buffers must store
-			// 1024 / 32 = 32 words, and these need 5-bit address bus, because
-			// 2 ** 5 = 32.
+			// width is 32 bits, so for e.g. 2048-bit operands buffers must store
+			// 2048 / 32 = 64 words, and these need 6-bit address bus, because
+			// 2 ** 6 = 64.
 			//
-		parameter	OPERAND_ADDR_WIDTH = 5
+		parameter	OPERAND_ADDR_WIDTH = 6
 	)
 	(
-		input											clk,
-		input											rst_n,
+		input											clk,						// clock
+		input											rst_n,					// active-low reset
 
-		input											ena,
-		output										rdy,
+		input											ena,						// enable input
+		output										rdy,						// ready output
 
-		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,
-		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,
+		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,			// modulus memory address
+		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,	// modulus coefficient memory address
 
-		input		[                32-1:0]	n_bram_out,
+		input		[                32-1:0]	n_bram_out,				// modulus memory output
 
-		output	[                32-1:0]	n_coeff_bram_in,
-		output										n_coeff_bram_wr,
+		output	[                32-1:0]	n_coeff_bram_in,		// modulus coefficient memory input
+		output										n_coeff_bram_wr,		// modulus coefficient memory write enable
 
-		input		[OPERAND_ADDR_WIDTH-1:0]	n_num_words
+		input		[OPERAND_ADDR_WIDTH-1:0]	n_num_words				// number of words in modulus
 	);
 	
 		//
@@ -79,191 +79,286 @@ module modexpa7_n_coeff #
 	localparam	[ 7: 0]	FSM_STATE_CALC_2	= 8'hB2;
 	localparam	[ 7: 0]	FSM_STATE_CALC_3	= 8'hB3;
 	localparam	[ 7: 0]	FSM_STATE_CALC_4	= 8'hB4;
-	/*
 	localparam	[ 7: 0]	FSM_STATE_CALC_5	= 8'hB5;
-	localparam	[ 7: 0]	FSM_STATE_CALC_6	= 8'hB6;
-	localparam	[ 7: 0]	FSM_STATE_CALC_7	= 8'hB7;
-	localparam	[ 7: 0]	FSM_STATE_CALC_8	= 8'hB8;
 	
 	localparam	[ 7: 0]	FSM_STATE_SAVE_1	= 8'hC1;
 	localparam	[ 7: 0]	FSM_STATE_SAVE_2	= 8'hC2;
 	localparam	[ 7: 0]	FSM_STATE_SAVE_3	= 8'hC3;
 	localparam	[ 7: 0]	FSM_STATE_SAVE_4	= 8'hC4;
 	localparam	[ 7: 0]	FSM_STATE_SAVE_5	= 8'hC5;
-	*/
+
 	localparam	[ 7: 0]	FSM_STATE_STOP		= 8'hFF;
 	
+	
+		//
+		// FSM State / Next State
+		//
 	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
 	reg	[ 7: 0]	fsm_next_state;
 
 
-		//
-		// Enable Delay (Trigger)
-		//
+		//
+		// Enable Delay and Trigger
+		//
    reg ena_dly = 1'b0;
-   wire ena_trig = ena && !ena_dly;
+	
+		/* delay enable by one clock cycle */
    always @(posedge clk) ena_dly <= ena;
+
+		/* trigger new operation when enable goes high */
+   wire ena_trig = ena && !ena_dly;
+	
 	
+		//
+		// Ready Flag Logic
+		//
+	reg rdy_reg = 1'b1;
+	assign rdy = rdy_reg;
+
+   always @(posedge clk or negedge rst_n)
+		
+			/* reset flag */
+		if (rst_n == 1'b0)						rdy_reg <= 1'b1;
+		else begin
+		
+				/* clear flag when operation is started */
+			if (fsm_state == FSM_STATE_IDLE)	rdy_reg <= ~ena_trig;
+			
+				/* set flag after operation is finished */
+			if (fsm_state == FSM_STATE_STOP)	rdy_reg <= 1'b1;			
+			
+		end
+		
 		
 		//
 		// Parameters Latch
 		//
 	reg	[OPERAND_ADDR_WIDTH-1:0]	n_num_words_latch;
 
+		/* save number of words in modulus when new operation starts */
 	always @(posedge clk)
 		//
 		if (fsm_next_state == FSM_STATE_INIT_1)
 			n_num_words_latch <= n_num_words;
 
-	
-		//
-		// Addresses
-		//
-	localparam	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
-	wire			[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last = n_num_words_latch;
 		
-	
-	/*
 		//
 		// Cycle Counters
 		//
-	reg	[OPERAND_ADDR_WIDTH+5:0]	cyc_cnt;		// cycle counter
+	reg	[OPERAND_ADDR_WIDTH+4:0]	cyc_cnt;
 		
-	wire	[OPERAND_ADDR_WIDTH+5:0]	cyc_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}, {5{1'b0}}};
-	wire	[OPERAND_ADDR_WIDTH+5:0]	cyc_cnt_last = {n_num_words, 1'b1, {5{1'b1}}};
-	wire	[OPERAND_ADDR_WIDTH+5:0]	cyc_cnt_next = cyc_cnt + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH+4:0]	cyc_cnt_zero = {{OPERAND_ADDR_WIDTH{1'b0}}, {5{1'b0}}};
+	wire	[OPERAND_ADDR_WIDTH+4:0]	cyc_cnt_last = {n_num_words, 5'b11110};
+	wire	[OPERAND_ADDR_WIDTH+4:0]	cyc_cnt_next = cyc_cnt + 1'b1;
 
+		/* handy flag */
 	wire	cyc_cnt_done = (cyc_cnt == cyc_cnt_last) ? 1'b1 : 1'b0;
 
-	
 	always @(posedge clk)
 		//
 		if (fsm_next_state == FSM_STATE_CALC_1)
 			//
 			case (fsm_state)
-				FSM_STATE_INIT_2:	cyc_cnt <= cyc_cnt_zero;
-				FSM_STATE_SAVE_5:	cyc_cnt <= cyc_cnt_done ? cyc_cnt : cyc_cnt_next;
+				FSM_STATE_INIT_5:	cyc_cnt <= cyc_cnt_zero;
+				FSM_STATE_SAVE_5:	cyc_cnt <= !cyc_cnt_done ? cyc_cnt_next : cyc_cnt;
 			endcase
-		*/	
-
-	
-	
-		
-		//
-		// Ready Flag Logic
+			
+			
 		//
-	reg rdy_reg = 1'b1;
-	assign rdy = rdy_reg;
-
-   always @(posedge clk or negedge rst_n)
+		// Handy Address Values
 		//
-		if (rst_n == 1'b0)						rdy_reg <= 1'b1;
-		else begin
-			if (fsm_state == FSM_STATE_IDLE)	rdy_reg <= ~ena_trig;
-			if (fsm_state == FSM_STATE_STOP)	rdy_reg <= 1'b1;
-		end
+		
+		/* the very first address */
+	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
+	
+		/* the very last address */
+	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last = n_num_words_latch;
 		
 		
 		//
 		// Block Memories
 		//
+		
+		/*
+		 * This module uses 8 block memories:
+		 *
+		 * N       - external input, stores modulus
+		 * R       - internal, stores intermediate result
+		 * B       - internal, stores current bit mask (see high-level algorithm)
+		 * T       - internal, stores the product R * NN (see high-level algorithm)
+		 * NN      - internal, stores the quantity ~N + 1 (see high-level algorithm)
+		 * RR      - internal, stores a copy of R (see high-level algorithm)
+		 * RB      - internal, stores the sum R + B (see high-level algorithm)
+		 * N_COEFF - external output, stores the calculated modulus-dependent coefficient
+		 *
+		 */
+		
 	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr;
 	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;	
+	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	t_addr;
 	reg	[OPERAND_ADDR_WIDTH-1:0]	nn_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	t_addr_wr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	t_addr_rd;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	rr_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	rb_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr;
 	
 	reg	[31: 0]	r_data_in;
 	reg	[31: 0]	b_data_in;
-	reg	[31: 0]	nn_data_in;
 	reg	[31: 0]	t_data_in;
+	reg	[31: 0]	nn_data_in;
+	reg	[31: 0]	rr_data_in;
+	reg	[31: 0]	rb_data_in;
+	reg	[31: 0]	n_coeff_data_in;
 	
 	wire	[31: 0]	r_data_out;
 	wire	[31: 0]	b_data_out;
-	wire	[31: 0]	nn_data_out;
 	wire	[31: 0]	t_data_out;
+	wire	[31: 0]	nn_data_out;
+	wire	[31: 0]	rr_data_out;
+	wire	[31: 0]	rb_data_out;
 	
-	reg				r_wren;
-	reg				b_wren;
-	reg				nn_wren;
-	reg				t_wren;
+	reg	r_wren;
+	reg	b_wren;
+	reg	t_wren;
+	reg	nn_wren;
+	reg	rr_wren;
+	reg	rb_wren;
+	reg	n_coeff_wren;
 		
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
-	bram_r (.clk(clk), .a_addr(r_addr), .a_wr(r_wren), .a_in(r_data_in), .a_out(r_data_out));
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_r (.clk(clk), .a_addr(r_addr), .a_wr(r_wren), .a_in(r_data_in), .a_out(r_data_out));
 
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
 	bram_b (.clk(clk), .a_addr(b_addr), .a_wr(b_wren), .a_in(b_data_in), .a_out(b_data_out));
 
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
 	bram_nn (.clk(clk), .a_addr(nn_addr), .a_wr(nn_wren), .a_in(nn_data_in), .a_out(nn_data_out));		
 
-	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
-	bram_t (.clk(clk), .a_addr(t_addr_wr), .a_wr(t_wren), .a_in(t_data_in), .a_out(), .b_addr(t_addr_rd), .b_out(t_data_out));
-		
-	assign n_bram_addr = n_addr;
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_t (.clk(clk), .a_addr(t_addr), .a_wr(t_wren), .a_in(t_data_in), .a_out(t_data_out));
+
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_rb (.clk(clk), .a_addr(rb_addr), .a_wr(rb_wren), .a_in(rb_data_in), .a_out(rb_data_out));
+
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_rr (.clk(clk), .a_addr(rr_addr), .a_wr(rr_wren), .a_in(rr_data_in), .a_out(rr_data_out));
+			
+		/* handy values */
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_addr_next				= n_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	r_addr_next				= r_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next				= b_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	t_addr_next				= t_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	nn_addr_next			= nn_addr      + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	rr_addr_next			= rr_addr      + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	rb_addr_next			= rb_addr      + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_next		= n_coeff_addr + 1'b1;
 	
-	wire	[OPERAND_ADDR_WIDTH-1:0]	n_addr_next  = n_addr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	r_addr_next  = r_addr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next  = b_addr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	nn_addr_next = nn_addr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	t_addr_wr_next  = t_addr_wr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	t_addr_rd_next  = t_addr_rd + 1'b1;
+		/* handy flags */
+	wire	n_addr_done				= (n_addr       == bram_addr_last) ? 1'b1 : 1'b0;
+	wire	r_addr_done				= (r_addr       == bram_addr_last) ? 1'b1 : 1'b0;
+	wire	b_addr_done				= (b_addr       == bram_addr_last) ? 1'b1 : 1'b0;
+	wire	t_addr_done				= (t_addr       == bram_addr_last) ? 1'b1 : 1'b0;
+	wire	nn_addr_done			= (nn_addr      == bram_addr_last) ? 1'b1 : 1'b0;	
+	wire	rr_addr_done			= (rr_addr      == bram_addr_last) ? 1'b1 : 1'b0;
+	wire	rb_addr_done			= (rb_addr      == bram_addr_last) ? 1'b1 : 1'b0;
+	wire	n_coeff_addr_done		= (n_coeff_addr == bram_addr_last) ? 1'b1 : 1'b0;
 	
-	wire										n_addr_done  = (n_addr  == bram_addr_last) ? 1'b1 : 1'b0;
-	wire										r_addr_done  = (r_addr  == bram_addr_last) ? 1'b1 : 1'b0;
-	wire										b_addr_done  = (b_addr  == bram_addr_last) ? 1'b1 : 1'b0;
-	wire										nn_addr_done = (nn_addr == bram_addr_last) ? 1'b1 : 1'b0;	
-	wire										t_addr_wr_done  = (t_addr_wr  == bram_addr_last) ? 1'b1 : 1'b0;	
-	wire										t_addr_rd_done  = (t_addr_rd  == bram_addr_last) ? 1'b1 : 1'b0;	
+		/* map top-level ports to internal registers */
+	assign n_bram_addr			= n_addr;
+	assign n_coeff_bram_addr	= n_coeff_addr;
+	assign n_coeff_bram_in		= n_coeff_data_in;
+	assign n_coeff_bram_wr		= n_coeff_wren;
+
+
+		//
+		// Delayed Flags
+		//
+	reg	rb_addr_done_dly;
+	
+		/* delay rb_addr_done flag by one clock cycle (used later) */
+	always @(posedge clk) rb_addr_done_dly <= rb_addr_done;
 	
 	
 		//
-		// Subtractor
+		// Adder1
 		//
-	wire	[31: 0]	add_s;
-	wire				add_c_in;
-	reg				add_b_lsb;
-	reg				add_c_in_mask;
-	reg				add_c_in_mask_dly;
-	wire				add_c_out;
+		
+		/*
+		 * This adder is used to calculate NN = ~N + 1.
+		 *
+		 */
+	wire	[31: 0]	add1_s;					// sum output
+	wire				add1_c_in;				// carry input
+	reg				add1_b_lsb;				// B-input
+	reg				add1_c_in_mask;		// flag to not carry anything into the very first word
+	reg				add1_c_in_mask_dly;	// delayed carry masking flag
+	wire				add1_c_out;				// carry output
 	
-	assign add_c_in = add_c_out & ~add_c_in_mask;
+		/* add masking into carry feedback chain */
+	assign add1_c_in = add1_c_out & ~add1_c_in_mask;
 
-	always @(posedge clk)
-		//
-		add_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
-		
-	always @(posedge clk)
-		//
-		add_b_lsb <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+		/* feed 1 into port B of adder */
+	always @(posedge clk) add1_b_lsb <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+
+		/* mask carry for the very first word of N */
+	always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+
+		/* delay carry masking flag by one clock cycle (used later) */
+	always @(posedge clk) add1_c_in_mask_dly <= add1_c_in_mask;
+	
+	modexpa7_pe_add add1_inst
+	(
+		.clk		(clk),								//
+		.ce		(1'b1),
+		.a			(~n_bram_out),						// ~N
+		.b			({{31{1'b0}}, add1_b_lsb}),	//  1
+		.c_in		(add1_c_in),						//
+		.s			(add1_s),							//
+		.c_out	(add1_c_out)						//
+	);
+	
 	
-	always @(posedge clk)
 		//
-		add_c_in_mask_dly <= add_c_in_mask;
+		// Adder2
+		//
+		
+		/*
+		 * This adder is used to calculate RB = R + B.
+		 *
+		 */
+	wire	[31: 0]	add2_s;			// sum output
+	reg				add2_c_in;		// carry input
+	wire				add2_c_out;		// carry output
 			
-	ip_add32 add_inst
+	modexpa7_pe_add add2_inst
 	(
 		.clk		(clk),
-		.a			(~n_bram_out),
-		.b			({{31{1'b0}}, add_b_lsb}),
-		.c_in		(add_c_in),
-		.s			(add_s),
-		.c_out	(add_c_out)
+		.ce		(1'b1),
+		.a			(r_data_out),
+		.b			(b_data_in),
+		.c_in		(add2_c_in),
+		.s			(add2_s),
+		.c_out	(add2_c_out)
 	);
 
 
 		//
 		// Multiplier
 		//
+		
+		/*
+		 * This multiplier is used to calculate T = R * NN.
+		 *
+		 */
+		 
 	reg	[31: 0]	pe_a;
 	reg	[31: 0]	pe_b;
 	reg	[31: 0]	pe_t;
 	reg	[31: 0]	pe_c_in;
 	wire	[31: 0]	pe_p;
 	wire	[31: 0]	pe_c_out;
-	
-	modexpa7_pe_mul pe2
+		
+	modexpa7_pe_mul pe_mul_inst
 	(
 		.clk		(clk),
 		.a			(pe_a),
@@ -274,161 +369,413 @@ module modexpa7_n_coeff #
 		.c_out	(pe_c_out)
 	);
 
+
+		//
+		// Multiplier Latency Compensation Logic
+		//
 		
-	/*
+	localparam SYSTOLIC_PE_LATENCY = 4;
+	
+		/* shift register to match data propagation delay */
+	reg [SYSTOLIC_PE_LATENCY:0] pe_latency;
+	wire pe_latency_done = pe_latency[SYSTOLIC_PE_LATENCY];
+	
+		/* gradually fill the shift register with ones */
 	always @(posedge clk)
 		//
-		case (fsm_next_state)
-			FSM_STATE_CALC_2:		f0_data_out_carry <= 1'b0;
-			FSM_STATE_CALC_3,
-			FSM_STATE_CALC_4,
-			FSM_STATE_CALC_5,
-			FSM_STATE_CALC_6:		f0_data_out_carry <= f0_data_out[31];
-			default:					f0_data_out_carry <= 1'bX;
-		endcase
-	*/
+		if (fsm_state == FSM_STATE_CALC_1)
+				pe_latency <= {1'b0, {SYSTOLIC_PE_LATENCY{1'b0}}};
+		else	pe_latency <= {pe_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b1};
 
-	/*
-	reg	sub_b_out_dly1;
-	reg	f0_data_out_carry_dly1;
-	reg	f0_data_out_carry_dly2;
+
+		//
+		// Adder2 Output Delay
+		//
+	reg	[31: 0]	add2_s_dly[1:SYSTOLIC_PE_LATENCY-1];
+	reg				add2_c_out_dly[1:SYSTOLIC_PE_LATENCY+2];	
+
+		/* delay sum */
+	integer i;
+	always @(posedge clk)
+		//
+		for (i=1; i<SYSTOLIC_PE_LATENCY; i=i+1)
+			add2_s_dly[i] <= (i == 1) ? add2_s : add2_s_dly[i-1];
+		
+		/* delay adder carry */
+	always @(posedge clk)
+		//
+		for (i=1; i<=(SYSTOLIC_PE_LATENCY+2); i=i+1)
+			add2_c_out_dly[i] <= (i == 1) ? add2_c_out : add2_c_out_dly[i-1];
+
+		/* adder carry feedback */
+	always @(posedge clk)
+		//
+		if ((fsm_next_state == FSM_STATE_CALC_3) && (nn_addr == bram_addr_zero))
+			add2_c_in <= (r_addr == bram_addr_zero) ? 1'b0 : add2_c_out_dly[SYSTOLIC_PE_LATENCY+2];
+			
+		//
+		// Multiplier Output Delay
+		//
+	reg	[31: 0]	pe_c_out_dly[1:3];
+
+	always @(posedge clk)
+		//
+		for (i=1; i<=3; i=i+1)
+			pe_c_out_dly[i] <= (i == 1) ? pe_c_out : pe_c_out_dly[i-1];
+
+
+		//
+		// Multiplier Operand Loader
+		//
+	always @(posedge clk)
+		//
+		if (fsm_next_state == FSM_STATE_CALC_3) begin
+			pe_a    <= r_data_out;
+			pe_b    <= nn_data_out;
+			pe_t    <= (nn_addr == bram_addr_zero) ? {32{1'b0}} : t_data_out;
+			pe_c_in <= (r_addr  == bram_addr_zero) ? {32{1'b0}} : pe_c_out_dly[3];
+		end else begin
+			pe_a    <= {32{1'bX}};
+			pe_b    <= {32{1'bX}};
+			pe_t    <= {32{1'bX}};
+			pe_c_in <= {32{1'bX}};		
+		end
+	
 	
-	always @(posedge clk) sub_b_out_dly1 <= sub_b_out;
+		//
+		// B Shift Carry Logic
+		//
 		
-	always @(posedge clk) f0_data_out_carry_dly1 <= f0_data_out_carry;
-	always @(posedge clk) f0_data_out_carry_dly2 <= f0_data_out_carry_dly1;
+		/*
+		 * B value is repeatedly shifted to the left, so we need carry logic
+		 * to save the MSB of the current output word and feed into the LSB
+		 * of the next input word.
+		 *
+		 */
+		 
+	reg	b_data_out_carry;
 	
-	reg	flag_keep_f;
+	always @(posedge clk)
+		//
+		case (fsm_next_state)
+		
+				/* mask carry into the very first word */
+			FSM_STATE_CALC_2:
+				if ((nn_addr == bram_addr_zero) && (b_addr == bram_addr_zero))
+					b_data_out_carry <= 1'b0;
+					
+				/* carry feedback */
+			FSM_STATE_CALC_3:
+				if (nn_addr == bram_addr_zero)
+					b_data_out_carry <= b_data_out[31];
+					
+		endcase
+		
+		
+		//
+		// R Update Flag
+		//
+	reg	flag_update_r;
 	
+		/* indices of the target bit of T */
+	wire	[                   4:0]	flag_addr_bit  = cyc_cnt_next[4:0];
+	wire	[OPERAND_ADDR_WIDTH-1:0]	flag_addr_word	= cyc_cnt_next[OPERAND_ADDR_WIDTH+4:5];
+	
+		/* update flag when the target bit of T is available */
 	always @(posedge clk)
 		//
-		if (fsm_next_state == FSM_STATE_SAVE_1)
-			flag_keep_f <= sub_b_out_dly1 & ~f0_data_out_carry_dly2;
-	*/
+		if (t_wren && (t_addr == flag_addr_word))
+			flag_update_r <= t_data_in[flag_addr_bit];
+	
 	
-	always @* t_addr_rd = r_addr + nn_addr;
+		//
+		// Block Memory Address Logic
+		//
+
+	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr_calc1;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_calc1;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	t_addr_calc1;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	nn_addr_calc1;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	rr_addr_calc1;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	rb_addr_calc1;
 	
+		/* how to update R during CALC_1 state (combinational logic, so blocking assignments are used) */
+	always @*
+		// restart from word 0 when coming from INIT_5 or when r_addr has reached (n_num_words_latch - nn_addr), otherwise advance
+		if (fsm_state == FSM_STATE_INIT_5)					r_addr_calc1 = bram_addr_zero;
+		else begin
+			if (r_addr < (n_num_words_latch - nn_addr))	r_addr_calc1 = r_addr_next;
+			else														r_addr_calc1 = bram_addr_zero;
+		end
+
+		/* how to update B, RR, RB during CALC_1 state */
+	always @* begin
+		//
+		b_addr_calc1  = b_addr;
+		rr_addr_calc1 = rr_addr;
+		rb_addr_calc1 = rb_addr;
+		//
+		if ((fsm_state == FSM_STATE_INIT_5)	|| (fsm_state == FSM_STATE_SAVE_5)) begin
+			//
+			b_addr_calc1  = bram_addr_zero;
+			rr_addr_calc1 = bram_addr_zero;
+			rb_addr_calc1 = bram_addr_zero;
+			//
+		end else if (nn_addr == bram_addr_zero) begin
+			//
+			b_addr_calc1  = !b_addr_done  ? b_addr_next  : b_addr;
+			rr_addr_calc1 = !rr_addr_done ? rr_addr_next : rr_addr;
+			rb_addr_calc1 = !rb_addr_done ? rb_addr_next : rb_addr;
+			//
+		end
+		//
+	end
+
+		/* how to update T during CALC_1 state */
+	always @*
+		//
+		if ((fsm_state == FSM_STATE_INIT_5) || (fsm_state == FSM_STATE_SAVE_5))
+			t_addr_calc1 = bram_addr_zero;
+		else begin
+			if (r_addr == (n_num_words_latch - nn_addr))
+				t_addr_calc1 = nn_addr_next;
+			else
+				t_addr_calc1 = t_addr_next;
+		end
+
+		/* how to update NN during CALC_1 state */
+	always @* begin
+		//
+		nn_addr_calc1 = nn_addr;
+		//
+		if ((fsm_state == FSM_STATE_INIT_5) || (fsm_state == FSM_STATE_SAVE_5))
+			nn_addr_calc1 = bram_addr_zero;
+		else if (r_addr == (n_num_words_latch - nn_addr))
+			nn_addr_calc1 = nn_addr_next;
+		//
+	end
+
+
+		//
+		// Address Update Logic
+		//
 	always @(posedge clk) begin
 		//
+		// N
+		//
 		case (fsm_next_state)
-		
 			FSM_STATE_INIT_1:		n_addr <= bram_addr_zero;
-			
+			//
 			FSM_STATE_INIT_2,
 			FSM_STATE_INIT_3,
 			FSM_STATE_INIT_4,
 			FSM_STATE_INIT_5:		n_addr <= !n_addr_done ? n_addr_next : n_addr;
-			
 		endcase
 		//
-		case (fsm_next_state)
-			FSM_STATE_INIT_4:				nn_addr <= bram_addr_zero;
-			FSM_STATE_INIT_5:				nn_addr <= nn_addr_next;
-			FSM_STATE_CALC_1:
-				case (fsm_state)
-					FSM_STATE_INIT_5:		nn_addr <= bram_addr_zero;
-				endcase
-		endcase
+		// R
 		//
 		case (fsm_next_state)
 			FSM_STATE_INIT_4:		r_addr <= bram_addr_zero;
 			FSM_STATE_INIT_5:		r_addr <= r_addr_next;
-			FSM_STATE_CALC_1:		r_addr <= bram_addr_zero;
-			FSM_STATE_CALC_2,
-			FSM_STATE_CALC_3,
-			FSM_STATE_CALC_4:		r_addr <= r_addr_next;
-			
+			FSM_STATE_CALC_1:		r_addr <= r_addr_calc1;
+			FSM_STATE_SAVE_3:		r_addr <= bram_addr_zero;
+			//
+			FSM_STATE_SAVE_4,
+			FSM_STATE_SAVE_5:		r_addr <= r_addr_next;	
 		endcase
 		//
+		// B
+		//
 		case (fsm_next_state)
-			
 			FSM_STATE_INIT_4:		b_addr <= bram_addr_zero;
-			
 			FSM_STATE_INIT_5:		b_addr <= b_addr_next;
-			
+			FSM_STATE_CALC_1:		b_addr <= b_addr_calc1;
+		endcase
+		//
+		// T
+		//
+		case (fsm_next_state)			
+			FSM_STATE_CALC_1:		t_addr <= t_addr_calc1;			
+		endcase
+		//
+		// NN
+		//
+		case (fsm_next_state)
+			FSM_STATE_INIT_4:		nn_addr <= bram_addr_zero;
+			FSM_STATE_INIT_5:		nn_addr <= nn_addr_next;
+			FSM_STATE_CALC_1:		nn_addr <= nn_addr_calc1;
+		endcase
+		//
+		// RR
+		//
+		case (fsm_next_state)			
+			FSM_STATE_CALC_1:		rr_addr <= rr_addr_calc1;
+			FSM_STATE_SAVE_1:		rr_addr <= bram_addr_zero;
+			//
+			FSM_STATE_SAVE_2,
+			FSM_STATE_SAVE_3,
+			FSM_STATE_SAVE_4:		rr_addr <= !rr_addr_done ? rr_addr_next : rr_addr;	
+		endcase		
+		//
+		// RB
+		//
+		case (fsm_next_state)			
+			FSM_STATE_CALC_1:		rb_addr <= rb_addr_calc1;			
+			FSM_STATE_SAVE_1:		rb_addr <= bram_addr_zero;
+			//
+			FSM_STATE_SAVE_2,
+			FSM_STATE_SAVE_3,
+			FSM_STATE_SAVE_4:		rb_addr <= !rb_addr_done ? rb_addr_next : rb_addr;
+		endcase		
+		//
+		// N_COEFF
+		//
+		case (fsm_next_state)			
+			FSM_STATE_SAVE_3:		n_coeff_addr <= bram_addr_zero;
+			//
+			FSM_STATE_SAVE_4,
+			FSM_STATE_SAVE_5:		n_coeff_addr <= r_addr_next;
 		endcase
 		//
 	end
 
 
+		//
+		// Block Memory Write Enable Logic
+		//
 	always @(posedge clk) begin
 		//
-		case (fsm_next_state)			
-			FSM_STATE_INIT_4,
-			FSM_STATE_INIT_5:		nn_wren	<= 1'b1;
-			default:					nn_wren <= 1'b0;
-		endcase
+		// R
 		//
-		case (fsm_next_state)			
+		case (fsm_next_state)
 			FSM_STATE_INIT_4,
-			FSM_STATE_INIT_5:		r_wren	<= 1'b1;
+			FSM_STATE_INIT_5,
+			FSM_STATE_SAVE_3,
+			FSM_STATE_SAVE_4,
+			FSM_STATE_SAVE_5:		r_wren <= 1'b1;
 			default:					r_wren <= 1'b0;
 		endcase
 		//
+		// B
+		//
 		case (fsm_next_state)			
 			FSM_STATE_INIT_4,
-			FSM_STATE_INIT_5:		b_wren	<= 1'b1;
+			FSM_STATE_INIT_5:		b_wren <= 1'b1;
+			FSM_STATE_CALC_3:		b_wren <= (nn_addr == bram_addr_zero) ? 1'b1 : 1'b0;
 			default:					b_wren <= 1'b0;
 		endcase
-		/*
+		//
+		// T
+		//
 		case (fsm_next_state)			
+			FSM_STATE_CALC_5:		t_wren <= 1'b1;
+			default:					t_wren <= 1'b0;
+		endcase
+		//
+		// NN
+		//
+		case (fsm_next_state)			
+			FSM_STATE_INIT_4,
+			FSM_STATE_INIT_5:		nn_wren <= 1'b1;
+			default:					nn_wren <= 1'b0;
+		endcase
+		//
+		// RR
+		//
+		case (fsm_next_state)
+			FSM_STATE_CALC_5:		rr_wren <= (nn_addr == bram_addr_zero) ? 1'b1 : 1'b0;
+			default:					rr_wren <= 1'b0;
+		endcase
+		//
+		// RB
+		//
+		case (fsm_next_state)
+			FSM_STATE_CALC_5:		rb_wren <= (nn_addr == bram_addr_zero) ? 1'b1 : 1'b0;
+			default:					rb_wren <= 1'b0;
+		endcase
+		//
+		// N_COEFF
+		//
+		case (fsm_next_state)
 			FSM_STATE_SAVE_3,
 			FSM_STATE_SAVE_4,
-			FSM_STATE_SAVE_5:		f_wren <= cyc_cnt_done;
-			default:					f_wren <= 1'b0;
+			FSM_STATE_SAVE_5:		n_coeff_wren <= cyc_cnt_done;
+			default:					n_coeff_wren <= 1'b0;
 		endcase
-		*/
+		//
 	end
 	
+	
+		//
+		// Block Memory Input Logic
+		//
 	always @(posedge clk) begin
 		//
+		// R
+		//
 		case (fsm_next_state)
 			FSM_STATE_INIT_4,
-			FSM_STATE_INIT_5:		nn_data_in <= add_s;
-			default:					nn_data_in <= {32{1'bX}};
+			FSM_STATE_INIT_5:		r_data_in <= {{31{1'b0}}, add1_c_in_mask_dly};
+			//
+			FSM_STATE_SAVE_3,
+			FSM_STATE_SAVE_4,
+			FSM_STATE_SAVE_5:		r_data_in <= flag_update_r ? rb_data_out : rr_data_out;
+			default:					r_data_in <= {32{1'bX}};
 		endcase
 		//
+		// B
+		//
 		case (fsm_next_state)
 			FSM_STATE_INIT_4,
-			FSM_STATE_INIT_5:		r_data_in <= {{31{1'b0}}, add_c_in_mask_dly};
-			default:					r_data_in <= {32{1'bX}};
+			FSM_STATE_INIT_5:		b_data_in <= {{31{1'b0}}, add1_c_in_mask_dly};
+			FSM_STATE_CALC_3:		b_data_in <= (nn_addr == bram_addr_zero) ?
+				{b_data_out[30:0], b_data_out_carry} : {32{1'bX}};
+			default:					b_data_in <= {32{1'bX}};
 		endcase
 		//
+		// T
+		//
+		case (fsm_next_state)
+			FSM_STATE_CALC_5:		t_data_in <= pe_p;
+			default:					t_data_in <= {32{1'bX}};
+		endcase
+		//
+		// NN
+		//
 		case (fsm_next_state)
 			FSM_STATE_INIT_4,
-			FSM_STATE_INIT_5:		b_data_in <= {{31{1'b0}}, add_c_in_mask_dly};
-			default:					b_data_in <= {32{1'bX}};
+			FSM_STATE_INIT_5:		nn_data_in <= add1_s;
+			default:					nn_data_in <= {32{1'bX}};
 		endcase
-		/*
+		//
+		// RR
+		//
 		case (fsm_next_state)
-			FSM_STATE_CALC_3,
-			FSM_STATE_CALC_4,
-			FSM_STATE_CALC_5,
-			FSM_STATE_CALC_6:		f1_data_in <= f0_data_out_shifted;
-			default:					f1_data_in <= {32{1'bX}};
+			FSM_STATE_CALC_5:		rr_data_in <= r_data_out;
+			default:					rr_data_in <= {32{1'bX}};
 		endcase
 		//
+		// RB
+		//
 		case (fsm_next_state)
-			FSM_STATE_CALC_5,
-			FSM_STATE_CALC_6,
-			FSM_STATE_CALC_7,
-			FSM_STATE_CALC_8:		f2_data_in <= sub_d;
-			default:					f2_data_in <= {32{1'bX}};
+			FSM_STATE_CALC_5:		rb_data_in <= add2_s_dly[SYSTOLIC_PE_LATENCY-1];
+			default:					rb_data_in <= {32{1'bX}};
 		endcase
 		//
+		// N_COEFF
+		//
 		case (fsm_next_state)
 			FSM_STATE_SAVE_3,
 			FSM_STATE_SAVE_4,
-			FSM_STATE_SAVE_5:		f_data_in <= flag_keep_f ? f1_data_out : f2_data_out;
-			default:					f_data_in <= {32{1'bX}};
+			FSM_STATE_SAVE_5:		n_coeff_data_in <= flag_update_r ? rb_data_out : rr_data_out;
+			default:					n_coeff_data_in <= {32{1'bX}};
 		endcase
-		*/
+		//
 	end
 
-
 	
 		//
-		// FSM Transition Logic
+		// FSM Process
 		//
 	always @(posedge clk or negedge rst_n)
 		//
@@ -436,6 +783,9 @@ module modexpa7_n_coeff #
 		else						fsm_state <= fsm_next_state;
 	
 	
+		//
+		// FSM Transition Logic
+		//
 	always @* begin
 		//
 		fsm_next_state = FSM_STATE_STOP;
@@ -446,45 +796,28 @@ module modexpa7_n_coeff #
 										else							fsm_next_state = FSM_STATE_IDLE;
 												
 			FSM_STATE_INIT_1:										fsm_next_state = FSM_STATE_INIT_2;
-
 			FSM_STATE_INIT_2:										fsm_next_state = FSM_STATE_INIT_3;
-
 			FSM_STATE_INIT_3:										fsm_next_state = FSM_STATE_INIT_4;
-
 			FSM_STATE_INIT_4:										fsm_next_state = FSM_STATE_INIT_5;
-			
 			FSM_STATE_INIT_5:		if (nn_addr_done)			fsm_next_state = FSM_STATE_CALC_1;
 										else							fsm_next_state = FSM_STATE_INIT_5;
 
 			FSM_STATE_CALC_1:										fsm_next_state = FSM_STATE_CALC_2;
-
 			FSM_STATE_CALC_2:										fsm_next_state = FSM_STATE_CALC_3;
-
 			FSM_STATE_CALC_3:										fsm_next_state = FSM_STATE_CALC_4;
-			
-			FSM_STATE_CALC_4:										fsm_next_state = FSM_STATE_STOP;//FSM_STATE_CALC_5;
-			/*
-			FSM_STATE_CALC_5:										fsm_next_state = FSM_STATE_CALC_6;
-										
-			FSM_STATE_CALC_6:		if (f1_addr_done)			fsm_next_state = FSM_STATE_CALC_7;
-										else							fsm_next_state = FSM_STATE_CALC_6;
-										
-			FSM_STATE_CALC_7:										fsm_next_state = FSM_STATE_CALC_8;
-			
-			FSM_STATE_CALC_8:										fsm_next_state = FSM_STATE_SAVE_1;
+			FSM_STATE_CALC_4:		if (pe_latency_done)		fsm_next_state = FSM_STATE_CALC_5;
+										else							fsm_next_state = FSM_STATE_CALC_4;
+			FSM_STATE_CALC_5:		if (nn_addr_done)			fsm_next_state = FSM_STATE_SAVE_1;
+										else							fsm_next_state = FSM_STATE_CALC_1;
 			
 			FSM_STATE_SAVE_1:										fsm_next_state = FSM_STATE_SAVE_2;
-			
 			FSM_STATE_SAVE_2:										fsm_next_state = FSM_STATE_SAVE_3;
-			
 			FSM_STATE_SAVE_3:										fsm_next_state = FSM_STATE_SAVE_4;
-			
-			FSM_STATE_SAVE_4:		if (f12_addr_done_dly)	fsm_next_state = FSM_STATE_SAVE_5;
+			FSM_STATE_SAVE_4:		if (rb_addr_done_dly)	fsm_next_state = FSM_STATE_SAVE_5;
 										else							fsm_next_state = FSM_STATE_SAVE_4;
-
 			FSM_STATE_SAVE_5:		if (cyc_cnt_done)			fsm_next_state = FSM_STATE_STOP;
 										else							fsm_next_state = FSM_STATE_CALC_1;
-			*/						
+			
 			FSM_STATE_STOP:										fsm_next_state = FSM_STATE_IDLE;
 
 		endcase
diff --git a/src/rtl/pe/modexpa7_pe_mul.v b/src/rtl/pe/modexpa7_pe_mul.v
index e56d152..ff15981 100644
--- a/src/rtl/pe/modexpa7_pe_mul.v
+++ b/src/rtl/pe/modexpa7_pe_mul.v
@@ -47,34 +47,21 @@ module modexpa7_pe_mul
 		output	[31: 0]	c_out
 	);
 
-	reg	[31: 0]	a_reg1;
-	reg	[31: 0]	b_reg1;
-	reg	[31: 0]	t_reg1;
-	reg	[31: 0]	t_reg2;
-	reg	[31: 0]	t_reg3;
-	reg	[31: 0]	c_reg1;
-	reg	[31: 0]	c_reg2;
-	
-	reg	[63: 0]	ab_reg;
-	reg	[63: 0]	abc_reg;
-	reg	[63: 0]	abct_reg;
-	
-	assign p			= abct_reg[31: 0];
-	assign c_out	= abct_reg[63:32];
-	
-	always @(posedge clk) begin
-		a_reg1	<= a;
-		b_reg1	<= b;
-		c_reg1	<= c_in;
-		c_reg2	<= c_reg1;
-		t_reg1	<= t;
-		t_reg2	<= t_reg1;
-		t_reg3	<= t_reg2;
+	localparam LATENCY = 4;
 		
-		ab_reg	<= {{32{1'b0}}, a_reg1} * {{32{1'b0}}, b_reg1};
-		abc_reg	<= ab_reg  + {{32{1'b0}}, c_reg2};
-		abct_reg	<= abc_reg + {{32{1'b0}}, t_reg3};
-	end
+	reg	[63: 0]	abct[1:LATENCY];
+	
+	assign p			= abct[LATENCY][31: 0];
+	assign c_out	= abct[LATENCY][63:32];
+
+	wire	[63: 0]	ab = {{32{1'b0}}, a}    * {{32{1'b0}}, b};
+	wire	[63: 0]	ct = {{32{1'b0}}, c_in} + {{32{1'b0}}, t};
+
+	integer i;
+	always @(posedge clk)
+		//
+		for (i=1; i<=LATENCY; i=i+1)
+			abct[i] <= (i == 1) ? ab + ct : abct[i-1];
 
 endmodule
 
diff --git a/src/tb/tb_factor.v b/src/tb/tb_factor.v
index 53e6769..946883c 100644
--- a/src/tb/tb_factor.v
+++ b/src/tb/tb_factor.v
@@ -43,7 +43,7 @@ module tb_factor;
 		//
 		// Test Vectors
 		//
-	`include "../modexp_fpga_model_vectors.v";
+	`include "modexp_fpga_model_vectors.v";
 	
 		//
 		// Parameters
diff --git a/src/tb/tb_n_coeff.v b/src/tb/tb_n_coeff.v
index 6ab824a..269dc39 100644
--- a/src/tb/tb_n_coeff.v
+++ b/src/tb/tb_n_coeff.v
@@ -43,12 +43,13 @@ module tb_n_coeff;
 		//
 		// Test Vectors
 		//
-	`include "../modexp_fpga_model_vectors.v";
+	`include "modexp_fpga_model_vectors.v";
 	
 		//
 		// Parameters
 		//
 	localparam NUM_WORDS_384 = 384 / 32;
+	localparam NUM_WORDS_512 = 512 / 32;
 		
 		//
 		// Clock (100 MHz)
@@ -146,6 +147,7 @@ module tb_n_coeff;
 		#100;
 		
 		test_n_coeff_384(N_384);
+		test_n_coeff_512(N_512);
 		
 	end
       
@@ -169,9 +171,7 @@ module tb_n_coeff;
 				$finish;
 			end
 			
-			
-			n_num_words = 4'd11;								// set number of words
-			
+			n_num_words = 4'd11;								// set number of words	
 			write_memory_384(n);								// fill memory
 			
 			ena = 1;												// start operation
@@ -181,7 +181,7 @@ module tb_n_coeff;
 			while (!rdy) #10;									// wait for operation to complete
 			read_memory_384(result);						// get result from memory
 							
-			$display("    calculated: %x", result);	//
+			$display("    calculated: %x", result);	// display results
 			$display("    expected:   %x", n_coeff);	//
 							
 				// check calculated value
@@ -198,65 +198,159 @@ module tb_n_coeff;
 	endtask
 
 
-	task write_memory_384;
+	task test_n_coeff_512;								// 512-bit test: compute reference, run the hardware, compare
+		input	[511:0] n;									// value written to memory via write_memory_512
+		reg	[511:0] n_coeff;								// reference result produced by calc_n_coeff_512
+		reg	[511:0] result;								// value read back via read_memory_512
+		integer i;											// NOTE(review): not referenced anywhere in this task
+		begin
+			// step 1: build the reference value in the testbench itself
+			calc_n_coeff_512(n, n_coeff);					// calculate n_coeff on-the-fly
+			// step 2: cross-check the reference against the pre-computed test vector
+				// make sure that the value matches the one saved in the include file
+			if (n_coeff !== N_COEFF_512) begin			// !== is case inequality, so X/Z bits also count as a mismatch
+				$display("ERROR: Calculated factor value differs from the one in the test vector!");	// NOTE(review): says "factor" although n_coeff is compared - wording looks carried over, confirm
+				$finish;										// a bad reference makes the rest of the test meaningless
+			end
+			// step 3: load the operand
+			n_num_words = 4'd15;								// set number of words; presumably index of the last word (NUM_WORDS_512 - 1) - confirm
+			write_memory_512(n);								// fill memory
+			// step 4: pulse ena
+			ena = 1;												// start operation
+			#10;													// keep ena high for 1 clock tick
+			ena = 0;												// clear flag
+			// step 5: poll rdy, then fetch the result
+			while (!rdy) #10;									// wait for operation to complete
+			read_memory_512(result);						// get result from memory
+			// step 6: report both values
+			$display("    calculated: %x", result);	// display results
+			$display("    expected:   %x", n_coeff);	// reference value for visual comparison
+			// step 7: verdict (a failure is only printed, simulation is not stopped here)
+				// check calculated value
+			if (n_coeff === result) begin					// === is case equality, so X/Z bits in result fail the test
+				$display("        OK");
+				$display("SUCCESS: Test passed.");
+			end else begin
+				$display("        ERROR");
+				$display("FAILURE: Test not passed.");
+			end
+		
+		end
+	
+	endtask
 
-		input	[383:0] n;
 
+		//
+		// write_memory_384
+		//
+	task write_memory_384;
+		//
+		input	[383:0] n;
 		reg	[383:0] n_shreg;
-		
+		//
 		begin
-			
-			tb_n_wren	= 1;														// start filling memories
-			
-			n_shreg       = n;													//
-			
-			for (w=0; w<NUM_WORDS_384; w=w+1) begin						// write all words
-				
-				tb_n_addr	= w[3:0];											// set addresses
-				
-				tb_n_data       = n_shreg[31:0];								//
-				
-				n_shreg       = {{32{1'bX}}, n_shreg[383:32]};			//
-				
-				#10;																	// wait for 1 clock tick
-				
+			//
+			tb_n_wren = 1;											// start filling memory
+			n_shreg = n;											// preset shift register
+			//
+			for (w=0; w<NUM_WORDS_384; w=w+1) begin		// write all words
+				tb_n_addr = w[3:0];								// set address
+				tb_n_data = n_shreg[31:0];						// set data
+				n_shreg = {{32{1'bX}}, n_shreg[383:32]};	// update shift register
+				#10;													// wait for 1 clock tick
 			end
-			
-			tb_n_addr	= {4{1'bX}};											// wipe addresses
-			
-			tb_n_data       = {32{1'bX}};										//
-			
-			tb_n_wren = 0;														// stop filling memories
-		
+			//
+			tb_n_addr = {4{1'bX}};								// wipe address
+			tb_n_data = {32{1'bX}};								// wipe data
+			tb_n_wren = 0;											// stop filling memory
+			//
 		end
-		
+		//
 	endtask
 			
 
+		//
+		// write_memory_512
+		//
+	task write_memory_512;
+		// writes a 512-bit value as NUM_WORDS_512 32-bit words, least significant word at address 0
+		input	[511:0] n;										// value to store
+		reg	[511:0] n_shreg;								// working copy, shifted right by one word per write
+		// note: loop counter w is not declared in this task, it comes from the enclosing scope
+		begin
+			//
+			tb_n_wren = 1;											// start filling memory
+			n_shreg = n;											// preset shift register
+			//
+			for (w=0; w<NUM_WORDS_512; w=w+1) begin		// write all words
+				tb_n_addr = w[3:0];								// set address (4 bits cover words 0..15)
+				tb_n_data = n_shreg[31:0];						// set data (current lowest word)
+				n_shreg = {{32{1'bX}}, n_shreg[511:32]};	// update shift register; X is shifted in at the top, so consumed words become unknown
+				#10;													// wait for 1 clock tick
+			end
+			// drive X on the bus so that any use of stale address/data shows up in simulation
+			tb_n_addr = {4{1'bX}};								// wipe address
+			tb_n_data = {32{1'bX}};								// wipe data
+			tb_n_wren = 0;											// stop filling memory
+			//
+		end
+		//
+	endtask
+
+
+		//
+		// read_memory_384 
+		//
 	task read_memory_384;
-	
+		//
 		output	[383:0] n_coeff;
 		reg		[383:0] n_coeff_shreg;
-		
+		//
 		begin
-		
-				// read result word-by-word
-			for (w=0; w<NUM_WORDS_384; w=w+1) begin
-				tb_n_coeff_addr	= w[3:0];					// set address
-				#10;													// wait for 1 clock tick
+			//
+			for (w=0; w<NUM_WORDS_384; w=w+1) begin								// read result word-by-word
+				tb_n_coeff_addr	= w[3:0];											// set address
+				#10;																			// wait for 1 clock tick
 				n_coeff_shreg = {tb_n_coeff_data, n_coeff_shreg[383:32]};	// store data word
 			end
-			
-			tb_n_coeff_addr = {4{1'bX}};								// wipe address
-			n_coeff = n_coeff_shreg;											// return
-			
+			//
+			tb_n_coeff_addr = {4{1'bX}};												// wipe address
+			n_coeff = n_coeff_shreg;													// return
+			//
 		end
-		
+		//
 	endtask
 
 
-	task calc_n_coeff_384;
+		//
+		// read_memory_512
+		//
+	task read_memory_512;
+		// reads NUM_WORDS_512 32-bit words starting at address 0 and assembles them into one 512-bit value
+		output	[511:0] n_coeff;								// assembled value, word from address 0 in bits [31:0]
+		reg		[511:0] n_coeff_shreg;						// not initialised; every bit is overwritten once all 16 words are shifted in
+		// note: loop counter w is not declared in this task, it comes from the enclosing scope
+		begin
+			//
+			for (w=0; w<NUM_WORDS_512; w=w+1) begin								// read result word-by-word
+				tb_n_coeff_addr	= w[3:0];											// set address
+				#10;																			// wait for 1 clock tick
+				n_coeff_shreg = {tb_n_coeff_data, n_coeff_shreg[511:32]};	// store data word at the top, older words move down by 32 bits
+			end
+			//
+			tb_n_coeff_addr = {4{1'bX}};												// wipe address
+			n_coeff = n_coeff_shreg;													// return
+			//
+		end
+		//
+	endtask
 
+
+		//
+		// calc_n_coeff_384
+		//
+	task calc_n_coeff_384;
+		//
 		input		[383:0] n;
 		output	[383:0] n_coeff;
 		reg		[383:0] r;
@@ -264,24 +358,63 @@ module tb_n_coeff;
 		reg		[383:0] t;
 		reg		[383:0] b;
 		integer i;
-
+		//
 		begin
-
-			r = 384'd1;
-			b = 384'd1;
+			//
+			r  = 384'd1;
+			b  = 384'd1;
 			nn = ~n + 1'b1;
-
+			//
 			for (i=1; i<384; i=i+1) begin
+				//
 				b = {b[382:0], 1'b0};
 				t = r * nn;
+				//
 				if (t[i] == 1'b1)
 					r = r + b;
+				//
 			end
-
+			//
 			n_coeff = r;
-
+			//
 		end
+		//
+	endtask
+
 
+		//
+		// calc_n_coeff_512
+		//
+	task calc_n_coeff_512;
+		// reference model: bit-serial search for r such that r * (-n) == 1 (mod 2^512); needs odd n
+		input		[511:0] n;										// modulus; must be odd for the invariant below to hold
+		output	[511:0] n_coeff;								// resulting coefficient r
+		reg		[511:0] r;										// result under construction, one more valid bit per iteration
+		reg		[511:0] nn;										// two's complement of n, i.e. -n mod 2^512
+		reg		[511:0] t;										// r * nn truncated to 512 bits
+		reg		[511:0] b;										// single set bit at position i, i.e. 2^i
+		integer i;
+		// invariant at the top of iteration i: r * nn == 1 (mod 2^i)
+		begin
+			// bit 0 of r is 1, which satisfies the invariant for i = 1 when nn is odd
+			r  = 512'd1;
+			b  = 512'd1;
+			nn = ~n + 1'b1;
+			// fix bits 1..511 of r one at a time
+			for (i=1; i<512; i=i+1) begin
+				// advance b to 2^i and recompute the product with the current r
+				b = {b[510:0], 1'b0};
+				t = r * nn;
+				// bit i of the product must be 0; adding 2^i to r adds 2^i * nn and, nn being odd, flips that bit
+				if (t[i] == 1'b1)
+					r = r + b;
+				//
+			end
+			// presumably the Montgomery modulus-dependent coefficient expected from the hardware - confirm
+			n_coeff = r;
+			//
+		end
+		//
 	endtask
 
 
diff --git a/src/tb/tb_systolic_multiplier.v b/src/tb/tb_systolic_multiplier.v
index 3cbb8d1..a6380e5 100644
--- a/src/tb/tb_systolic_multiplier.v
+++ b/src/tb/tb_systolic_multiplier.v
@@ -44,7 +44,7 @@ module tb_systolic_multiplier;
 		//
 		// Test Vectors
 		//
-	`include "../modexp_fpga_model_vectors.v";
+	`include "modexp_fpga_model_vectors.v";
 	
 	
 		//

-- 
To stop receiving notification emails like this one, please contact
the administrator of this repository.


More information about the Commits mailing list