[Cryptech-Commits] [core/math/modexpa7] branch systolic updated: * Moved systolic processing element array into a separate module. * Finished top-level wrapper module.

git at cryptech.is git at cryptech.is
Sun Aug 6 18:47:40 UTC 2017


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch systolic
in repository core/math/modexpa7.

The following commit(s) were added to refs/heads/systolic by this push:
     new f96ad01   * Moved systolic processing element array into a separate module.  * Finished top-level wrapper module.
f96ad01 is described below

commit f96ad01980fc4d0ed40f6ffb0fbb7c2006421c18
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Sun Aug 6 21:46:35 2017 +0300

     * Moved systolic processing element array into a separate module.
     * Finished top-level wrapper module.
---
 src/rtl/modexpa7_exponentiator.v             |    4 +-
 src/rtl/modexpa7_systolic_multiplier.v       |  424 ++++++++-
 src/rtl/modexpa7_systolic_multiplier_array.v |   72 +-
 src/rtl/modexpa7_systolic_multiplier_fix.v   | 1202 ------------------------
 src/rtl/modexpa7_systolic_multiplier_old.v   | 1260 --------------------------
 src/rtl/modexpa7_wrapper.v                   |  130 ++-
 src/tb/tb_exponentiator.v                    |    4 +-
 src/tb/tb_systolic_multiplier.v              |    4 +-
 src/tb/tb_wrapper.v                          |  123 ++-
 9 files changed, 638 insertions(+), 2585 deletions(-)

diff --git a/src/rtl/modexpa7_exponentiator.v b/src/rtl/modexpa7_exponentiator.v
index cda6882..b33360a 100644
--- a/src/rtl/modexpa7_exponentiator.v
+++ b/src/rtl/modexpa7_exponentiator.v
@@ -665,7 +665,7 @@ module modexpa7_exponentiator #
 		.r_bram_in				(pp_data_in),
 		.r_bram_wr				(pp_wren),
 
-		.ab_num_words			(m_num_words_latch)
+		.n_num_words			(m_num_words_latch)
 	);
 
 	modexpa7_systolic_multiplier #
@@ -695,7 +695,7 @@ module modexpa7_exponentiator #
 		.r_bram_in				(tp_data_in),
 		.r_bram_wr				(tp_wren),
 
-		.ab_num_words			(m_num_words_latch)
+		.n_num_words			(m_num_words_latch)
 	);
 	
 	
diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
index 32ed543..7293998 100644
--- a/src/rtl/modexpa7_systolic_multiplier.v
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -96,14 +96,26 @@ module modexpa7_systolic_multiplier #
 	localparam	[ 7: 0]	FSM_STATE_MULT_START		= 8'h21;
 	localparam	[ 7: 0]	FSM_STATE_MULT_CRUNCH	= 8'h22;
 	localparam	[ 7: 0]	FSM_STATE_MULT_FINAL		= 8'h23;
+	
+	localparam	[ 7: 0]	FSM_STATE_ADD_START		= 8'h31;
+	localparam	[ 7: 0]	FSM_STATE_ADD_CRUNCH		= 8'h32;
+	localparam	[ 7: 0]	FSM_STATE_ADD_UNLOAD		= 8'h33;
+	localparam	[ 7: 0]	FSM_STATE_SUB_UNLOAD		= 8'h34;
+	localparam	[ 7: 0]	FSM_STATE_ADD_FINAL		= 8'h35;
+	
+	localparam	[ 7: 0]	FSM_STATE_SAVE_START		= 8'h41;
+	localparam	[ 7: 0]	FSM_STATE_SAVE_WRITE		= 8'h42;
+	localparam	[ 7: 0]	FSM_STATE_SAVE_FINAL		= 8'h43;
 	
 	localparam	[ 7: 0]	FSM_STATE_STOP				= 8'hFF;
+	
 	
 		/*
-		 * FSM State / Next State
+		 * FSM State / Next State / Previous State
 		 */
 	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
 	reg	[ 7: 0]	fsm_next_state;
+	reg	[ 7: 0]	fsm_prev_state;
 
 
 		/*
@@ -153,6 +165,31 @@ module modexpa7_systolic_multiplier #
 			
 			
 		/*
+		 * Multiplication Phase
+		 */
+	localparam	[ 1: 0]	MULT_PHASE_A_B				= 2'd1;
+	localparam	[ 1: 0]	MULT_PHASE_AB_N_COEFF	= 2'd2;
+	localparam	[ 1: 0]	MULT_PHASE_Q_N				= 2'd3;
+	localparam	[ 1: 0]	MULT_PHASE_STALL			= 2'd0;
+	
+	reg	[ 1: 0]	mult_phase;
+	
+	wire	mult_phase_done = (mult_phase == MULT_PHASE_STALL) ? 1'b1 : 1'b0;
+	
+   always @(posedge clk)
+		//
+		case (fsm_next_state)
+			FSM_STATE_LOAD_START:	if (ena_trig)	mult_phase <= MULT_PHASE_A_B;
+			FSM_STATE_MULT_FINAL:
+				case (mult_phase)
+					MULT_PHASE_A_B:						mult_phase <= MULT_PHASE_AB_N_COEFF;
+					MULT_PHASE_AB_N_COEFF:				mult_phase <= MULT_PHASE_Q_N;
+					MULT_PHASE_Q_N:						mult_phase <= MULT_PHASE_STALL;
+				endcase
+		endcase
+	
+			
+		/*
 		 * Counters
 		 */
 			
@@ -258,41 +295,130 @@ module modexpa7_systolic_multiplier #
 	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last     = {n_num_words_latch};
 	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_last = {n_num_words_latch, 1'b1};
 
-		// address registers
+		// address registers
 	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr;
 	wire	[OPERAND_ADDR_WIDTH  :0]	p_addr_ext_wr;
-	reg	[OPERAND_ADDR_WIDTH  :0]	p_addr_ext_rd;
+	wire	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_wr;
+	reg	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_rd;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_wr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_rd;
+	wire	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_wr;
+	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_rd;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	s_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr;
 		
 		// handy increment values
-	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next				= b_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH  :0]	p_addr_ext_rd_next	= b_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next				= b_addr         + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_addr_next				= n_addr         + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_rd_next	= ab_addr_ext_rd + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_rd_next			= q_addr_rd      + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_rd_next	= qn_addr_ext_rd + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_addr_next				= s_addr         + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_addr_next			= sn_addr        + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	r_addr_next				= r_addr         + 1'b1;
 	
 		// write enables
 	wire	p_wren;
+	wire	ab_wren;
+	wire	q_wren;
+	wire	qn_wren;
+	reg	s_wren;
+	reg	sn_wren;
+	reg	r_wren;
 	
 		// data buses
 	wire	[31: 0]	p_data_in;
-	wire	[31: 0]	p_data_out;
+	wire	[31: 0]	ab_data_in;
+	wire	[31: 0]	ab_data_out;
+	wire	[31: 0]	q_data_in;
+	wire	[31: 0]	q_data_out;
+	wire	[31: 0]	qn_data_in;
+	wire	[31: 0]	qn_data_out;
+	wire	[31: 0]	s_data_in;
+	wire	[31: 0]	s_data_out;
+	wire	[31: 0]	sn_data_in;
+	wire	[31: 0]	sn_data_out;
+	wire	[31: 0]	r_data_in;
 	
 		// handy stop flags
-	wire	b_addr_done        = (b_addr        == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	p_addr_ext_rd_done = (p_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0;
+	wire	b_addr_done				= (b_addr         == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	n_addr_done				= (n_addr         == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	ab_addr_ext_rd_done	= (ab_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0;
+	wire	q_addr_rd_done			= (q_addr_rd      == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	qn_addr_ext_rd_done	= (qn_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0;
+	wire	s_addr_done				= (s_addr         == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	sn_addr_done			= (sn_addr        == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	r_addr_done				= (r_addr         == bram_addr_last)     ? 1'b1 : 1'b0;
 
 		// delayed addresses
 	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_dly;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr_dly;
+	reg	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_rd_dly;
+	reg	[OPERAND_ADDR_WIDTH : 0]	qn_addr_ext_rd_dly1;
+	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_rd_dly2;
+	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_rd_dly3;
 	
-	always @(posedge clk) b_addr_dly <= b_addr;
-
+	always @(posedge clk) b_addr_dly				<= b_addr;
+	always @(posedge clk) n_addr_dly				<= n_addr;
+	always @(posedge clk) ab_addr_ext_rd_dly	<= ab_addr_ext_rd;
+	always @(posedge clk) qn_addr_ext_rd_dly1 <= qn_addr_ext_rd;
+	always @(posedge clk) qn_addr_ext_rd_dly2 <= qn_addr_ext_rd_dly1;
+	always @(posedge clk) qn_addr_ext_rd_dly3 <= qn_addr_ext_rd_dly2;
 				
 		// map registers to top-level ports
 	assign b_bram_addr = b_addr;
+	assign n_bram_addr = n_addr;
+	assign r_bram_addr = r_addr;
+
+		// map AB/Q/QN write ports onto the shared multiplier output (p_*) and R registers onto top-level ports
+	assign ab_addr_ext_wr	= p_addr_ext_wr[OPERAND_ADDR_WIDTH  :0];
+	assign q_addr_wr			= p_addr_ext_wr[OPERAND_ADDR_WIDTH-1:0];
+	assign qn_addr_ext_wr	= p_addr_ext_wr[OPERAND_ADDR_WIDTH  :0];
+	assign r_bram_wr			= r_wren;
+	
+	assign ab_data_in		= p_data_in;
+	assign q_data_in		= p_data_in;
+	assign qn_data_in		= p_data_in;
+	assign r_bram_in		= r_data_in;
+	
+	assign ab_wren		= p_wren && (mult_phase == MULT_PHASE_A_B);
+	assign q_wren		= p_wren && (mult_phase == MULT_PHASE_AB_N_COEFF);
+	assign qn_wren		= p_wren && (mult_phase == MULT_PHASE_Q_N);
+		
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
+	bram_ab
+	(	.clk(clk),
+		.a_addr(ab_addr_ext_wr), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(),
+		.b_addr(ab_addr_ext_rd), .b_out(ab_data_out)
+	);
 
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_q
+	(	.clk(clk),
+		.a_addr(q_addr_wr), .a_wr(q_wren), .a_in(q_data_in), .a_out(),
+		.b_addr(q_addr_rd), .b_out(q_data_out)
+	);
 
 	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
-	bram_p
+	bram_qn
 	(	.clk(clk),
-		.a_addr(p_addr_ext_wr), .a_wr(p_wren), .a_in(p_data_in), .a_out(),
-		.b_addr(p_addr_ext_rd), .b_out(p_data_out)
+		.a_addr(qn_addr_ext_wr), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(),
+		.b_addr(qn_addr_ext_rd), .b_out(qn_data_out)
+	);
+
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_s
+	(	.clk(clk),
+		.a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out)
+	);
+
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_sn
+	(	.clk(clk),
+		.a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out)
 	);
 
 				
@@ -308,10 +434,24 @@ module modexpa7_systolic_multiplier #
 			//
 			FSM_STATE_LOAD_SHIFT: begin
 		
-						// update the rightmost part of loader buffer
-				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
+					// update the rightmost part of loader buffer
+				case (mult_phase)
 				
-						// shift the loader buffer to the left
+					MULT_PHASE_A_B:
+						loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
+							(b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
+							
+					MULT_PHASE_AB_N_COEFF:
+						loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
+							(ab_addr_ext_rd_dly <= {1'b0, bram_addr_last}) ? ab_data_out : {32{1'b0}};
+							
+					MULT_PHASE_Q_N:
+						loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
+							(n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
+							
+				endcase
+				
+					// shift the loader buffer to the left
 				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
 					loader_din[j-1] <= loader_din[j];
 					
@@ -348,16 +488,60 @@ module modexpa7_systolic_multiplier #
 				loader_addr_wr <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
 					
 		endcase
-	
 
 		/*
+		 * Result Select Flag (selects S or SN as the data written to R)
+		 */
+	reg flag_select_s;
+	
+	assign r_data_in = flag_select_s ? s_data_out : sn_data_out;
+	
+	
+		/*
 		 * Memory Address Control Logic
 		 */
 	always @(posedge clk) begin
 		//
-		case (fsm_next_state)
-			FSM_STATE_LOAD_START:	b_addr <= bram_addr_zero;
-			FSM_STATE_LOAD_SHIFT:	b_addr <= b_addr_next;
+		case (fsm_next_state)
+		
+			FSM_STATE_LOAD_START: begin
+				ab_addr_ext_rd		<= bram_addr_ext_zero;
+			end
+								
+			FSM_STATE_LOAD_SHIFT: begin
+				ab_addr_ext_rd		<= ab_addr_ext_rd_next;
+			end
+		
+			FSM_STATE_ADD_START: begin
+				ab_addr_ext_rd		<= bram_addr_ext_zero;
+				qn_addr_ext_rd		<= bram_addr_ext_zero;
+			end
+			
+			FSM_STATE_ADD_CRUNCH: begin
+				ab_addr_ext_rd		<= ab_addr_ext_rd_next;
+				qn_addr_ext_rd		<= qn_addr_ext_rd_next;
+			end
+				
+		endcase
+		//
+		case (fsm_next_state)
+	
+			FSM_STATE_LOAD_START: begin
+				b_addr				<= bram_addr_zero;
+				n_addr				<= bram_addr_zero;
+			end
+								
+			FSM_STATE_LOAD_SHIFT: begin
+				b_addr 				<= b_addr_next;
+				n_addr				<= n_addr_next;
+			end
+					
+			FSM_STATE_ADD_CRUNCH,
+			FSM_STATE_ADD_UNLOAD: begin
+				if (qn_addr_ext_rd_dly1 == {1'b0, bram_addr_last})			n_addr <= bram_addr_zero;
+				else if (qn_addr_ext_rd_dly1 >  {1'b0, bram_addr_last})	n_addr <= n_addr_next;				
+			end
+			
 		endcase
 		//
 	end
@@ -378,10 +562,27 @@ module modexpa7_systolic_multiplier #
 		
 	always @(posedge clk)
 		//
-		case (fsm_next_state)
-			FSM_STATE_MULT_START:	p_num_words_latch <= {n_num_words_latch, 1'b1};
-		endcase
+		if (fsm_next_state == FSM_STATE_MULT_START)
+			//
+			case (mult_phase)
+				MULT_PHASE_A_B:			p_num_words_latch <= {n_num_words_latch, 1'b1};
+				MULT_PHASE_AB_N_COEFF:	p_num_words_latch <= {1'b0, n_num_words_latch};
+				MULT_PHASE_Q_N:			p_num_words_latch <= {n_num_words_latch, 1'b1};
+			endcase
 			
+	assign n_coeff_bram_addr = a_bram_addr;
+	assign q_addr_rd = a_bram_addr;
+	
+	reg	[31: 0]	a_data_out;
+	
+	always @*
+		//
+		case (mult_phase)
+			MULT_PHASE_A_B:			a_data_out = a_bram_out;
+			MULT_PHASE_AB_N_COEFF:	a_data_out = n_coeff_bram_out;
+			MULT_PHASE_Q_N:			a_data_out = q_data_out;
+			default:						a_data_out = {32{1'bX}};
+		endcase
 	
 	modexpa7_systolic_multiplier_array #
 	(
@@ -398,7 +599,7 @@ module modexpa7_systolic_multiplier #
 
 		.loader_addr_rd	(loader_addr_rd),
 		
-		.pe_a_wide			({SYSTOLIC_ARRAY_LENGTH{a_bram_out}}),
+		.pe_a_wide			({SYSTOLIC_ARRAY_LENGTH{a_data_out}}),
 		.pe_b_wide			(pe_b_wide),
 		
 		.a_bram_addr		(a_bram_addr),
@@ -411,24 +612,174 @@ module modexpa7_systolic_multiplier #
 		.p_num_words		(p_num_words_latch)
 	);
 	
+		/*
+		 * Adder
+		 */
+		 
+	reg				add1_ce;					// clock enable
+	wire	[31: 0]	add1_s;					// sum output
+	wire				add1_c_in;				// carry input
+	wire	[31: 0]	add1_a;					// A-input
+	wire	[31: 0]	add1_b;					// B-input
+	reg				add1_c_in_mask;		// flag to not carry anything into the very first word
+	wire				add1_c_out;				// carry output
 
+	modexpa7_adder32 add1_inst
+	(
+		.clk		(clk),
+		.ce		(add1_ce),
+		.a			(add1_a),
+		.b			(add1_b),
+		.c_in		(add1_c_in),
+		.s			(add1_s),
+		.c_out	(add1_c_out)
+	);
 
+		/*
+		 * Subtractor
+		 */		 
+	reg				sub1_ce;					// clock enable
+	wire	[31: 0]	sub1_d;					// difference output
+	wire				sub1_b_in;				// borrow input
+	wire	[31: 0]	sub1_a;					// A-input
+	wire	[31: 0]	sub1_b;					// B-input
+	reg				sub1_b_in_mask;		// flag to not borrow anything from the very first word
+	wire				sub1_b_out;				// borrow output
 
+	modexpa7_subtractor32 sub1_inst
+	(
+		.clk		(clk),
+		.ce		(sub1_ce),
+		.a			(sub1_a),
+		.b			(sub1_b),
+		.b_in		(sub1_b_in),
+		.d			(sub1_d),
+		.b_out	(sub1_b_out)
+	);
+	
+		// add masking into carry feedback chain
+	assign add1_c_in = add1_c_out & ~add1_c_in_mask;
 
+		// add masking into borrow feedback chain
+	assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
+
+		// mask carry for the very first words of AB and QN
+	always @(posedge clk)
+		//
+		add1_c_in_mask <= (fsm_state == FSM_STATE_ADD_START) ? 1'b1 : 1'b0;
 
+		// mask borrow for the very first words of S and N
+	always @(posedge clk)
+		//
+		sub1_b_in_mask <= add1_c_in_mask;
+			
+	
+		// map adder inputs
+	assign add1_a = ab_data_out;
+	assign add1_b = qn_data_out;
+	
+		// map subtractor inputs
+	assign sub1_a = add1_s;
+	assign sub1_b = (qn_addr_ext_rd_dly2 <= {1'b0, bram_addr_last}) ? 32'd0 : n_bram_out;
+	
+		// clock enable
+	always @(posedge clk) begin
+		//
+		case (fsm_state)
+			FSM_STATE_ADD_START,
+			FSM_STATE_ADD_CRUNCH:	add1_ce <= 1'b1;
+			default:						add1_ce <= 1'b0;
+		endcase
+		//
+		sub1_ce <= add1_ce;
+		//
+	end
+		
+		// map outputs
+	assign s_data_in = add1_s;
+	assign sn_data_in = sub1_d;
+		
+		// write enables
+	always @(posedge clk) begin
+		//
+		case (fsm_state)
+			FSM_STATE_ADD_CRUNCH,
+			FSM_STATE_ADD_UNLOAD:	s_wren <= 1'b1;
+			default:						s_wren <= 1'b0;		
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_ADD_CRUNCH,
+			FSM_STATE_ADD_UNLOAD,
+			FSM_STATE_SUB_UNLOAD,
+			FSM_STATE_ADD_FINAL:		sn_wren <= s_wren;
+			default:						sn_wren <= 1'b0;
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_SAVE_START,
+			FSM_STATE_SAVE_WRITE:	r_wren <= 1'b1;
+			default:						r_wren <= 1'b0;
+		endcase
+		//
+	end
+
+		// S, SN and R memory address control
+	always @(posedge clk) begin
+		//
+		case (fsm_state)
+			FSM_STATE_ADD_CRUNCH,
+			FSM_STATE_ADD_UNLOAD: begin
+					if (qn_addr_ext_rd_dly1 == {1'b0, bram_addr_zero})			s_addr <= bram_addr_zero;
+					else if (qn_addr_ext_rd_dly2 >  {1'b0, bram_addr_last})	s_addr <= s_addr_next;
+				end
+			FSM_STATE_ADD_FINAL:															s_addr <= bram_addr_zero;
+			FSM_STATE_SAVE_START,
+			FSM_STATE_SAVE_WRITE:														s_addr <= s_addr_next;
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_ADD_CRUNCH,
+			FSM_STATE_ADD_UNLOAD,
+			FSM_STATE_SUB_UNLOAD: begin
+					if (qn_addr_ext_rd_dly2 == {1'b0, bram_addr_zero})			sn_addr <= bram_addr_zero;
+					else if (qn_addr_ext_rd_dly3 >  {1'b0, bram_addr_last})	sn_addr <= sn_addr_next;
+				end
+			FSM_STATE_ADD_FINAL:															sn_addr <= bram_addr_zero;
+			FSM_STATE_SAVE_START,
+			FSM_STATE_SAVE_WRITE:														sn_addr <= sn_addr_next;
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_SAVE_START:	r_addr <= bram_addr_zero;
+			FSM_STATE_SAVE_WRITE:	r_addr <= r_addr_next;
+		endcase
+		//
+	end
+
+		
+		/*
+		 * Flag Update Logic
+		 */
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_ADD_FINAL)
+			flag_select_s <= sub1_b_out & ~add1_c_out;
 
 
 
-
-		
 			
 		/*
 		 * FSM Process
-	-	 */
+		 */
 	always @(posedge clk or negedge rst_n)
-		//
+		//
 		if (rst_n == 1'b0)	fsm_state <= FSM_STATE_IDLE;
-		else						fsm_state <= fsm_next_state;
+		else						fsm_state <= fsm_next_state;
+
+	always @(posedge clk)
+		//
+		fsm_prev_state <= fsm_state;
 	
 	
 		/*
@@ -453,7 +804,20 @@ module modexpa7_systolic_multiplier #
 			FSM_STATE_MULT_START:											fsm_next_state = FSM_STATE_MULT_CRUNCH;
 			FSM_STATE_MULT_CRUNCH:		if (pe_array_rdy)				fsm_next_state = FSM_STATE_MULT_FINAL;
 												else								fsm_next_state = FSM_STATE_MULT_CRUNCH;
-			FSM_STATE_MULT_FINAL:											fsm_next_state = FSM_STATE_STOP;
+			FSM_STATE_MULT_FINAL:		if (mult_phase_done)			fsm_next_state = FSM_STATE_ADD_START;
+												else								fsm_next_state = FSM_STATE_LOAD_START;
+			//
+			FSM_STATE_ADD_START:												fsm_next_state = FSM_STATE_ADD_CRUNCH;
+			FSM_STATE_ADD_CRUNCH:		if (ab_addr_ext_rd_done)	fsm_next_state = FSM_STATE_ADD_UNLOAD;
+												else								fsm_next_state = FSM_STATE_ADD_CRUNCH;
+			FSM_STATE_ADD_UNLOAD:											fsm_next_state = FSM_STATE_SUB_UNLOAD;
+			FSM_STATE_SUB_UNLOAD:											fsm_next_state = FSM_STATE_ADD_FINAL;
+			FSM_STATE_ADD_FINAL:												fsm_next_state = FSM_STATE_SAVE_START;
+			//
+			FSM_STATE_SAVE_START:											fsm_next_state = FSM_STATE_SAVE_WRITE;
+			FSM_STATE_SAVE_WRITE:		if (s_addr_done)				fsm_next_state = FSM_STATE_SAVE_FINAL;
+												else								fsm_next_state = FSM_STATE_SAVE_WRITE;
+			FSM_STATE_SAVE_FINAL:											fsm_next_state = FSM_STATE_STOP;
 			//
 			FSM_STATE_STOP:													fsm_next_state = FSM_STATE_IDLE;
 			//
diff --git a/src/rtl/modexpa7_systolic_multiplier_array.v b/src/rtl/modexpa7_systolic_multiplier_array.v
index 22d5aaf..754203d 100644
--- a/src/rtl/modexpa7_systolic_multiplier_array.v
+++ b/src/rtl/modexpa7_systolic_multiplier_array.v
@@ -195,11 +195,15 @@ module modexpa7_systolic_multiplier_array #
 
 	wire	shreg_done_load		= shreg_load[syst_cnt_last];
 	wire	shreg_done_latency	= shreg_latency[SYSTOLIC_PE_LATENCY];
-	wire	shreg_done_unload		= shreg_unload[syst_cnt_last];
-
+	wire	shreg_done_unload		= shreg_unload[syst_cnt_last];
+	
 	reg	shreg_now_loading;
 	reg	shreg_now_latency;
 	reg	shreg_now_unloading;
+
+	reg	shreg_done_latency_dly;
+	always @(posedge clk)
+		shreg_done_latency_dly <= shreg_done_latency;
 	
 	always @(posedge clk)
 		//
@@ -257,17 +261,22 @@ module modexpa7_systolic_multiplier_array #
 	reg	fifo_c_rst;
 	reg	fifo_t_rst;
 
-	wire	fifo_c_wren;
+	reg	fifo_c_wren;
 	wire	fifo_c_rden;
 	
-	wire	fifo_t_wren;
+	reg	fifo_t_wren;
 	wire	fifo_t_rden;
 		
-	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_c_din;
+	reg	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_c_din;
 	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_c_dout;
 	
 	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_t_din;
 	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_t_dout;
+	
+	wire	[32 *                          1  - 1 : 0]	fifo_t_din_msb;
+	reg	[32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0]	fifo_t_din_lsb;
+	
+	assign fifo_t_din = {fifo_t_din_msb, fifo_t_din_lsb};
 	
 	modexpa7_simple_fifo #
 	(
@@ -317,10 +326,26 @@ module modexpa7_systolic_multiplier_array #
 			//
 			assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
 			assign pe_t[i]    = fifo_t_dout[32 * (i + 1) - 1 -: 32];
+			//
+			always @(posedge clk)
+				fifo_c_din[32 * (i + 1) - 1 -: 32] <= pe_c_out[i];
+			//
+		end
+		//
+	endgenerate
+	
+	generate for (i=1; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
+		//
+		begin : gen_modexpa7_fifo_t_lsb
+			//
+			always @(posedge clk)
+				fifo_t_din_lsb[32 * i - 1 -: 32] <= pe_p[i];
 			//
 		end
 		//
-	endgenerate
+	endgenerate
+	
+	assign fifo_t_din_msb = shreg_now_unloading ? pe_p[0] : 32'd0;
 
 
 		/*
@@ -340,6 +365,15 @@ module modexpa7_systolic_multiplier_array #
 			FSM_STATE_MULT_CRUNCH:	if (shreg_done_load)		fifo_t_rst <= 1'b0;
 		endcase
 
+		/*
+		 * FIFO Read / Write Enables
+		 */
+	assign fifo_c_rden = shreg_now_loading;
+	assign fifo_t_rden = shreg_now_loading;
+
+	always @(posedge clk) fifo_c_wren <= shreg_now_unloading;
+	always @(posedge clk) fifo_t_wren <= shreg_now_unloading;
+
 
 		/*
 		 * Block Memory Interface
@@ -390,16 +424,22 @@ module modexpa7_systolic_multiplier_array #
 
 
 
-//		/*
-//		 *
-//		 */
-//	always @(posedge clk)
-//		//
-//		case (fsm_next_state)
-//			FSM_STATE_MULT_RELOAD:	p_wren <= 1'b1;
-//			default:						p_wren <= 1'b0;
-//		endcase
-//
+		/*
+		 * Product Write Enable and Product Data Register
+		 */
+	always @(posedge clk)
+		//
+		case (fsm_next_state)
+			FSM_STATE_MULT_RELOAD:	p_wren <= 1'b1;
+			default:						p_wren <= 1'b0;
+		endcase
+
+
+	always @(posedge clk)
+		//
+		if ((fsm_state == FSM_STATE_MULT_CRUNCH) && shreg_done_latency_dly)
+			p_data_in <= pe_p[0];
+
 		/*
 		 * Block Memory Address Control
 		 */
diff --git a/src/rtl/modexpa7_systolic_multiplier_fix.v b/src/rtl/modexpa7_systolic_multiplier_fix.v
deleted file mode 100644
index 40b2144..0000000
--- a/src/rtl/modexpa7_systolic_multiplier_fix.v
+++ /dev/null
@@ -1,1202 +0,0 @@
-//======================================================================
-//
-// modexpa7_systolic_multiplier.v
-// -----------------------------------------------------------------------------
-// Systolic Montgomery multiplier.
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2017, NORDUnet A/S All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-// - Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may
-//   be used to endorse or promote products derived from this software
-//   without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-//======================================================================
-
-module modexpa7_systolic_multiplier #
-	(
-			//
-			// This sets the address widths of memory buffers. Internal data
-			// width is 32 bits, so for e.g. 2048-bit operands buffers must store
-			// 2048 / 32 = 64 words, and these need 6-bit address bus, because
-			// 2 ** 6 = 64.
-			//
-		parameter	OPERAND_ADDR_WIDTH		= 4,
-		
-			//
-			// Explain.
-			//
-		parameter	SYSTOLIC_ARRAY_POWER		= 1
-	)
-	(
-		input											clk,
-		input											rst_n,
-
-		input											ena,
-		output										rdy,
-
-		output	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr,
-		output	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr,
-		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,
-		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,
-		output	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr,
-
-		input		[                32-1:0]	a_bram_out,
-		input		[                32-1:0]	b_bram_out,
-		input		[                32-1:0]	n_bram_out,
-		input		[                32-1:0]	n_coeff_bram_out,
-
-		output	[                32-1:0]	r_bram_in,
-		output										r_bram_wr,
-
-		input		[OPERAND_ADDR_WIDTH-1:0]	ab_num_words
-	);
-	
-		
-		//
-		// Include Settings
-		//
-	`include "pe/modexpa7_primitive_switch.v"
-	`include "modexpa7_settings.v"
-		
-
-		//
-		// FSM Declaration
-		//
-	localparam	[ 7: 0]	FSM_STATE_IDLE								= 8'h00;
-
-	localparam	[ 7: 0]	FSM_STATE_LOAD_B_START					= 8'h11;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_B_SHIFT					= 8'h12;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_B_WRITE					= 8'h13;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_B_FINAL					= 8'h14;
-
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_START			= 8'h21;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_SHIFT			= 8'h22;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_WRITE			= 8'h23;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_FINAL			= 8'h24;
-
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_START					= 8'h31;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_SHIFT					= 8'h32;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_WRITE					= 8'h33;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_FINAL					= 8'h34;
-
-	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_START				= 8'h41;
-	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_CRUNCH				= 8'h42;
-	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_RELOAD				= 8'h43;
-	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_FINAL				= 8'h44;
-
-	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_START		= 8'h51;
-	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_CRUNCH		= 8'h52;
-	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_RELOAD		= 8'h53;
-	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_FINAL		= 8'h54;
-
-	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_START				= 8'h61;
-	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_CRUNCH				= 8'h62;
-	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_RELOAD				= 8'h63;
-	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_FINAL				= 8'h64;
-	
-	localparam	[ 7: 0]	FSM_STATE_SAVE_START						= 8'h71;
-	localparam	[ 7: 0]	FSM_STATE_SAVE_WRITE						= 8'h72;
-	localparam	[ 7: 0]	FSM_STATE_SAVE_FINAL						= 8'h73;	
-	
-	localparam	[ 7: 0]	FSM_STATE_STOP								= 8'hFF;
-	
-		//
-		// FSM State / Next State
-		//
-	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
-	reg	[ 7: 0]	fsm_next_state;
-
-
-		//
-		// Enable Delay and Trigger
-		//
-   reg ena_dly = 1'b0;
-	
-		/* delay enable by one clock cycle */
-   always @(posedge clk) ena_dly <= ena;
-
-		/* trigger new operation when enable goes high */
-   wire ena_trig = ena && !ena_dly;
-	
-	
-		//
-		// Ready Flag Logic
-		//
-	reg rdy_reg = 1'b1;
-	assign rdy = rdy_reg;
-
-   always @(posedge clk or negedge rst_n)
-		
-			/* reset flag */
-		if (rst_n == 1'b0) rdy_reg <= 1'b1;
-		else begin
-		
-				/* clear flag when operation is started */
-			if (fsm_state == FSM_STATE_IDLE)	rdy_reg <= ~ena_trig;
-			
-				/* set flag after operation is finished */
-			if (fsm_state == FSM_STATE_STOP)	rdy_reg <= 1'b1;			
-			
-		end
-		
-		
-		//
-		// Parameters Latch
-		//
-	reg	[OPERAND_ADDR_WIDTH-1:0]	ab_num_words_latch;
-
-		/* save number of words in a and b when new operation starts */
-	always @(posedge clk)
-		//
-		if (fsm_next_state == FSM_STATE_LOAD_B_START)
-			ab_num_words_latch <= ab_num_words;
-			
-			
-		//
-		// Systolic Cycle Counters
-		//
-		
-		/* handy values */
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
-	
-		/* counters */
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_init;
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load;
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload;
-		
-		/* handy increment values */
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_init_next		= syst_cnt_init   + 1'b1;
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_next		= syst_cnt_load   + 1'b1;
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload_next		= syst_cnt_unload + 1'b1;
-
-		/* handy stop flags */
-	wire										syst_cnt_init_done		= (syst_cnt_init   == syst_cnt_last) ? 1'b1 : 1'b0;
-	wire										syst_cnt_load_done		= (syst_cnt_load   == syst_cnt_last) ? 1'b1 : 1'b0;
-	wire										syst_cnt_unload_done		= (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
-
-		/* delayed load counter */
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_dly;
-	always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;
-
-
-		//
-		// Multiplier Iteration Counter
-		//
-		
-		/* handy values */
-	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
-	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
-	
-		/* counter */
-	reg	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt;
-	
-		/* handy increment value and stop flag */
-	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_next = mult_cnt + 1'b1;
-	wire										mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;
-			
-			
-		//
-		// Initialization Counter Control Logic
-		//
-	always @(posedge clk) begin
-		//
-		case (fsm_state)
-			FSM_STATE_LOAD_B_START,
-			FSM_STATE_LOAD_N_COEFF_START,
-			FSM_STATE_LOAD_N_START:				mult_cnt <= mult_cnt_zero;
-			
-			FSM_STATE_LOAD_B_SHIFT,
-			FSM_STATE_LOAD_N_COEFF_SHIFT,
-			FSM_STATE_LOAD_N_SHIFT:				mult_cnt <= mult_cnt_next;
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_LOAD_B_START,
-			FSM_STATE_LOAD_N_COEFF_START,
-			FSM_STATE_LOAD_N_START:				syst_cnt_init <= syst_cnt_zero;
-			
-			FSM_STATE_LOAD_B_WRITE,
-			FSM_STATE_LOAD_N_COEFF_WRITE,
-			FSM_STATE_LOAD_N_WRITE:				syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
-		endcase
-		//
-	end
-	
-	
-		//
-		// Operand Loader
-		//
-	
-		/*
-		 * Explain how parallelized loader works here...
-		 *
-		 */
-	
-		/* loader banks: the two upper address bits pick the B, N_COEFF or N region */
-	localparam	[ 1: 0]	LOADER_ADDR_MSB_B				= 2'd0;
-	localparam	[ 1: 0]	LOADER_ADDR_MSB_N_COEFF		= 2'd1;
-	localparam	[ 1: 0]	LOADER_ADDR_MSB_N				= 2'd2;
-	
-		/* loader input (one entry per array position) */
-	reg	[                  2-1:0]	loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];	
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
-	reg										loader_wren    [0:SYSTOLIC_ARRAY_LENGTH-1];
-	reg	[                 32-1:0]	loader_din     [0:SYSTOLIC_ARRAY_LENGTH-1];
-	
-		/* loader output (one entry per array position) */
-	wire	[                 32-1:0]	loader_dout    [0:SYSTOLIC_ARRAY_LENGTH-1];
-			
-		/* generate parallelized loader */
-		
-		//
-		// Loader currently stores B, N_COEFF and N, it can be coded another way
-		// to initially store B, then AB, then Q. Some memory can be saved that way.
-		// Maybe later...
-		//
-		
-	genvar i;
-	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
-		// one single-port block memory per array position
-		begin : gen_bram_1rw_readfirst_loader
-			// address is {bank select, word index}; two extra address bits hold the bank
-			bram_1rw_readfirst #
-			(
-				.MEM_WIDTH		(32),
-				.MEM_ADDR_BITS	(SYSTOLIC_CNTR_WIDTH + 2)
-			)
-			bram_loader
-			(
-				.clk		(clk),
-				.a_addr	({loader_addr_msb[i], loader_addr_lsb[i]}),
-				.a_wr		(loader_wren[i]),
-				.a_in		(loader_din[i]),
-				.a_out	(loader_dout[i])
-			);
-			//
-		end
-		//
-	endgenerate
-	
-
-		//
-		// Block Memory Addresses
-		//
-		
-		/*
-		 * Explain why there are two memory sizes.
-		 *
-		 */
-		
-		/* the very first addresses (narrow and one-bit-wider "ext" flavour) */
-	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero			= {      {OPERAND_ADDR_WIDTH{1'b0}}};
-	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_zero	= {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
-	
-		/* the very last addresses (the extended one is {last, 1'b1}, i.e. 2*last+1) */
-	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last     = {ab_num_words_latch};
-	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_last = {ab_num_words_latch, 1'b1};
-
-		/* address registers (the *_ext ones are one bit wider) */
-	reg	[OPERAND_ADDR_WIDTH-1:0]	a_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr;
-	reg	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	q_addr;
-	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	s_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr;
-		
-		/* handy increment values (wrap at the register width) */
-	wire	[OPERAND_ADDR_WIDTH-1:0]	a_addr_next			= a_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next			= b_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_next	= n_coeff_addr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	n_addr_next			= n_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_next	= ab_addr_ext  + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_next			= q_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_next	= qn_addr_ext  + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	s_addr_next			= s_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_addr_next		= sn_addr      + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	r_addr_next			= r_addr       + 1'b1;
-	
-		/* handy stop flags: high while the address sits on its last word */
-	wire	a_addr_done			= (a_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	b_addr_done			= (b_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	n_coeff_addr_done	= (n_coeff_addr  == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	n_addr_done			= (n_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	ab_addr_ext_done	= (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
-	wire	q_addr_done			= (q_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	qn_addr_ext_done	= (qn_addr_ext     == bram_addr_ext_last)     ? 1'b1 : 1'b0;
-	wire	s_addr_done	= (s_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	sn_addr_done	= (sn_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	r_addr_done	= (r_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
-
-		/* B, N_COEFF and N addresses delayed by one clock */
-	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_dly;
-	always @(posedge clk) b_addr_dly <= b_addr;
-
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_dly;
-	always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr;
-
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr_dly;
-	always @(posedge clk) n_addr_dly <= n_addr;
-				
-		/* map registers to top-level ports */
-	assign a_bram_addr = a_addr;
-	assign b_bram_addr = b_addr;
-	assign n_coeff_bram_addr = n_coeff_addr;
-	assign n_bram_addr = n_addr;
-	assign r_bram_addr = r_addr;
-
-
-		//
-		// Flag
-		//
-	reg	flag_select_s;
-	
-	
-		//
-		// Memory Address Control Logic
-		//
-	always @(posedge clk) begin
-		// B / N_COEFF / N read addresses: zeroed on entry to *_START, advanced on entry to *_SHIFT (keyed on fsm_next_state)
-		case (fsm_next_state)
-			FSM_STATE_LOAD_B_START:				b_addr <= bram_addr_zero;
-			FSM_STATE_LOAD_N_COEFF_START:		n_coeff_addr <= bram_addr_zero;
-			FSM_STATE_LOAD_N_START:				n_addr <= bram_addr_zero;
-			
-			FSM_STATE_LOAD_B_SHIFT:				b_addr <= b_addr_next;
-			FSM_STATE_LOAD_N_COEFF_SHIFT:		n_coeff_addr <= n_coeff_addr_next;
-			FSM_STATE_LOAD_N_SHIFT:				n_addr <= n_addr_next;
-		endcase
-		// Q*N pass: n_addr restarts when qn_addr_ext equals the last narrow address and advances while it is above it
-		case (fsm_state)
-			FSM_STATE_MULT_Q_N_RELOAD: 
-				if (qn_addr_ext == {1'b0, bram_addr_last})
-					n_addr		<= bram_addr_zero;
-				else if (qn_addr_ext > {1'b0, bram_addr_last})
-					n_addr		<= n_addr_next;
-			
-		endcase
-		// result write address: zeroed in SAVE_START, advanced in SAVE_WRITE
-		case (fsm_state)
-			FSM_STATE_SAVE_START:	r_addr <= bram_addr_zero;
-			FSM_STATE_SAVE_WRITE:	r_addr <= r_addr_next;
-		endcase
-		// A read address: advanced on entry to every A*B reload, held once a_addr_done is set
-		case (fsm_next_state)
-			FSM_STATE_MULT_A_B_START:	a_addr <= bram_addr_zero;
-			FSM_STATE_MULT_A_B_RELOAD:	a_addr <= !a_addr_done ? a_addr_next : a_addr;
-		endcase
-		//
-	end
-	
-	
-		//
-		// Internal Memories
-		//
-
-		/* memory inputs (s/sn are wires fed straight from the adder/subtractor outputs) */
-	reg	[31: 0]	ab_data_in;
-	reg	[31: 0]	q_data_in;
-	reg	[31: 0]	qn_data_in;
-	wire	[31: 0]	s_data_in;
-	wire	[31: 0]	sn_data_in;
-	reg	[31: 0]	r_data_in;
-
-		/* memory outputs */
-	wire	[31: 0]	ab_data_out;
-	wire	[31: 0]	q_data_out;
-	wire	[31: 0]	qn_data_out;
-	wire	[31: 0]	s_data_out;
-	wire	[31: 0]	sn_data_out;
-
-		/* write enables */
-	reg	ab_wren;
-	reg	q_wren;
-	reg	qn_wren;
-	reg	s_wren;
-	reg	sn_wren;
-	reg	r_wren;
-	
-		/* map the result write port to the top level */
-	assign r_bram_in = r_data_in;
-	assign r_bram_wr = r_wren;
-
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))	// AB: one extra address bit
-	bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
-
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))	// Q
-	bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
-	
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))	// QN: one extra address bit
-	bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
-
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))	// S  (adder output)
-	bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));
-
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))	// SN (subtractor output)
-	bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));
-
-	
-		//
-		// Wide Operand Loader
-		//
-	integer j;
-	
-		/* shift logic */
-	always @(posedge clk)
-		// one word enters the loader buffer per *_SHIFT cycle; words whose delayed address is past bram_addr_last are replaced by zero
-		case (fsm_state)
-			//
-			FSM_STATE_LOAD_B_SHIFT: begin
-		
-						/* update the rightmost part of loader buffer */
-				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
-				
-						/* shift the loader buffer to the left */
-				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_din[j-1] <= loader_din[j];
-					
-			end
-			//
-			FSM_STATE_LOAD_N_COEFF_SHIFT: begin
-		
-						/* update the rightmost part of loader buffer */
-				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : {32{1'b0}};
-				
-						/* shift the loader buffer to the left */
-				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_din[j-1] <= loader_din[j];
-					
-			end
-			//
-			FSM_STATE_LOAD_N_SHIFT: begin
-		
-						/* update the rightmost part of loader buffer */
-				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
-				
-						/* shift the loader buffer to the left */
-				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_din[j-1] <= loader_din[j];
-					
-			end					
-			// other states: loader_din keeps its contents
-		endcase
-		
-
-		/* write enable logic */
-	always @(posedge clk)
-		// keyed on fsm_next_state, so the registered enables are high during the *_WRITE state itself
-		case (fsm_next_state)
-		
-			FSM_STATE_LOAD_B_WRITE,
-			FSM_STATE_LOAD_N_COEFF_WRITE,
-			FSM_STATE_LOAD_N_WRITE:
-				// all loader memories are written in the same cycle
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_wren[j] <= 1'b1;
-					
-			default:
-				// no write in any other state
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_wren[j] <= 1'b0;
-					
-		endcase
-
-		/* loader address update logic */
-	always @(posedge clk) begin
-		// word index while filling the loader: cleared in *_START, stepped in every *_WRITE state (keyed on fsm_state)
-		case (fsm_state)
-		
-			FSM_STATE_LOAD_B_START,
-			FSM_STATE_LOAD_N_COEFF_START,
-			FSM_STATE_LOAD_N_START:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_lsb[j] <= syst_cnt_zero;
-					
-			FSM_STATE_LOAD_B_WRITE,
-			FSM_STATE_LOAD_N_COEFF_WRITE,
-			FSM_STATE_LOAD_N_WRITE:
-				// same update rule as syst_cnt_init itself
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
-					
-		endcase
-		// word index while multiplying: cleared on entry to *_START / *_RELOAD, stepped on entry to *_CRUNCH (keyed on fsm_next_state)
-		case (fsm_next_state)
-			FSM_STATE_MULT_A_B_START,
-			FSM_STATE_MULT_AB_N_COEFF_START,
-			FSM_STATE_MULT_Q_N_START,
-			FSM_STATE_MULT_A_B_RELOAD,
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
-			FSM_STATE_MULT_Q_N_RELOAD:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_lsb[j] <= syst_cnt_zero;
-													
-			FSM_STATE_MULT_A_B_CRUNCH,
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-			FSM_STATE_MULT_Q_N_CRUNCH:
-				// NOTE(review): the hold value is syst_cnt_init, whereas syst_cnt_load's own update holds syst_cnt_load -- confirm this is intended
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_init;
-		endcase
-		// bank select: follows the operand that is being loaded or multiplied by
-		case (fsm_next_state)
-		
-			FSM_STATE_LOAD_B_START,
-			FSM_STATE_MULT_A_B_START:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_msb[j] <= LOADER_ADDR_MSB_B;
-
-			FSM_STATE_LOAD_N_COEFF_START,
-			FSM_STATE_MULT_AB_N_COEFF_START:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
-					
-			FSM_STATE_LOAD_N_START,
-			FSM_STATE_MULT_Q_N_START:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_msb[j] <= LOADER_ADDR_MSB_N;
-
-		endcase
-		//
-	end
-	
-	
-		//
-		// Systolic Array of Processing Elements
-		//
-	reg	[31: 0]	pe_a        [0:SYSTOLIC_ARRAY_LENGTH-1];	// registered A operand per PE
-	reg	[31: 0]	pe_b        [0:SYSTOLIC_ARRAY_LENGTH-1];	// registered B operand per PE
-	wire	[31: 0]	pe_t        [0:SYSTOLIC_ARRAY_LENGTH-1];	// T input, sliced from fifo_t_dout
-	wire	[31: 0]	pe_c_in     [0:SYSTOLIC_ARRAY_LENGTH-1];	// carry input, sliced from fifo_c_dout
-	wire	[31: 0]	pe_p        [0:SYSTOLIC_ARRAY_LENGTH-1];	// product output
-	wire	[31: 0]	pe_c_out    [0:SYSTOLIC_ARRAY_LENGTH-1];	// carry output
-	reg	[31: 0]	pe_c_out_dly[0:SYSTOLIC_ARRAY_LENGTH-1];	// carry output delayed by one clock
-	
-
-		//
-		// The register arrays commented out below are no longer used: carries
-		// and T words travel through the fifo_c / fifo_t instances further down.
-		//
-	//reg	[31: 0]	pe_c_out_mem[0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
-	//reg	[31: 0]	pe_t_mem    [0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
-
-	reg	fifo_c_rst;
-	reg	fifo_t_rst;
-
-	wire	fifo_c_wren;
-	wire	fifo_c_rden;
-	
-	wire	fifo_t_wren;
-	wire	fifo_t_rden;
-		
-	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_c_din;	// one 32-bit lane per PE
-	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_c_dout;
-	
-	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_t_din;	// one 32-bit lane per PE
-	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_t_dout;
-	
-	/* carry (C) and T FIFOs: both are 32 * SYSTOLIC_ARRAY_LENGTH bits wide with SYSTOLIC_CNTR_WIDTH depth bits */
-	modexpa7_simple_fifo #
-	(
-		.BUS_WIDTH	(32 * SYSTOLIC_ARRAY_LENGTH),
-		.DEPTH_BITS	(SYSTOLIC_CNTR_WIDTH)
-	)
-	fifo_c
-	(
-		.clk			(clk),
-		.rst			(fifo_c_rst),
-		.wr_en		(fifo_c_wren),
-		.d_in			(fifo_c_din),
-		.rd_en		(fifo_c_rden),
-		.d_out		(fifo_c_dout)
-	);
-	
-	modexpa7_simple_fifo #
-	(
-		.BUS_WIDTH	(32 * SYSTOLIC_ARRAY_LENGTH),
-		.DEPTH_BITS	(SYSTOLIC_CNTR_WIDTH)
-	)
-	fifo_t
-	(
-		.clk			(clk),
-		.rst			(fifo_t_rst),
-		.wr_en		(fifo_t_wren),
-		.d_in			(fifo_t_din),
-		.rd_en		(fifo_t_rden),
-		.d_out		(fifo_t_dout)
-	);
-	
-	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)	// one processing element per array position
-		begin : modexpa7_systolic_pe_multiplier		
-			modexpa7_systolic_pe systolic_pe_inst
-			(
-				.clk		(clk),
-				.a			(pe_a[i]),
-				.b			(pe_b[i]),
-				.t			(pe_t[i]),
-				.c_in		(pe_c_in[i]),
-				.p			(pe_p[i]),
-				.c_out	(pe_c_out[i])
-			);
-			assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];	// lane i of the carry FIFO output
-			assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];	// lane i of the T FIFO output
-			assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];	// delayed carry goes back into lane i
-			always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];	// one-clock delay on the carry output
-		end
-	endgenerate
-
-
-
-		
-			
-			//
-			// Shift Registers
-			//
-	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_load;	// walking 1 that times the load phase
-	reg	[SYSTOLIC_PE_LATENCY  :0]	shreg_latency;	// walking 1 that times the PE latency
-	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_unload;	// walking 1 that times the unload phase
-
-	wire	shreg_done_load = shreg_load[syst_cnt_last];	// tap position is selected by syst_cnt_last
-	wire	shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
-	wire	shreg_done_unload = shreg_unload[syst_cnt_last];	// tap position is selected by syst_cnt_last
-
-	reg										shreg_now_loading;
-	reg										shreg_now_latency;
-	reg										shreg_now_unloading;
-	
-	reg										shreg_done_latency_dly;	// shreg_done_latency delayed by one clock
-	
-	always @(posedge clk)
-		shreg_done_latency_dly <= shreg_done_latency;
-
-	always @(posedge clk)
-		// shift registers and phase flags, sequenced by the multiplier FSM state
-		case (fsm_state)
-			FSM_STATE_LOAD_N_FINAL: begin
-				shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
-				shreg_latency	<= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
-				shreg_unload	<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
-			end
-			// START / RELOAD: seed a single 1 into shreg_load and shreg_latency, clear shreg_unload
-			FSM_STATE_MULT_A_B_START,
-			FSM_STATE_MULT_AB_N_COEFF_START,
-			FSM_STATE_MULT_Q_N_START,
-			FSM_STATE_MULT_A_B_RELOAD,
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
-			FSM_STATE_MULT_Q_N_RELOAD: begin
-				shreg_now_loading	<= 1'b1;
-				shreg_now_latency <= 1'b1;
-				shreg_now_unloading <= 1'b0;
-				shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
-				shreg_latency	<= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
-				shreg_unload	<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
-			end
-			// CRUNCH: shift by one each clock; the top bit of shreg_latency is fed into shreg_unload
-			FSM_STATE_MULT_A_B_CRUNCH,
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-			FSM_STATE_MULT_Q_N_CRUNCH: begin
-				shreg_load		<= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
-				shreg_latency	<= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
-				shreg_unload	<= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
-				
-				if (shreg_done_load) shreg_now_loading <= 1'b0;
-				if (shreg_done_latency) shreg_now_latency <= 1'b0;
-				if (shreg_done_latency) shreg_now_unloading <= 1'b1;
-				else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
-				
-			end
-			// any other state: no phase is active (the shift registers keep their value)
-			default: begin
-				shreg_now_loading <= 1'b0;
-				shreg_now_latency <= 1'b0;
-				shreg_now_unloading <= 1'b0;
-			end
-			//
-		endcase
-		
-		
-	always @(posedge clk)
-		// fifo_c_rst is set in every *_START state and cleared in CRUNCH once shreg_done_load is seen; it is not touched in RELOAD
-		case (fsm_state)
-			FSM_STATE_MULT_A_B_START,
-			FSM_STATE_MULT_AB_N_COEFF_START,
-			FSM_STATE_MULT_Q_N_START:			fifo_c_rst <= 1'b1;
-			
-			FSM_STATE_MULT_A_B_CRUNCH,
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-			FSM_STATE_MULT_Q_N_CRUNCH:		if (shreg_done_load)	fifo_c_rst <= 1'b0;
-		endcase
-
-	always @(posedge clk)
-		// fifo_t_rst is set in every *_START state and cleared in CRUNCH once shreg_done_load is seen; it is not touched in RELOAD
-		case (fsm_state)
-			FSM_STATE_MULT_A_B_START,
-			FSM_STATE_MULT_AB_N_COEFF_START,
-			FSM_STATE_MULT_Q_N_START:			fifo_t_rst <= 1'b1;
-			
-			FSM_STATE_MULT_A_B_CRUNCH,
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-			FSM_STATE_MULT_Q_N_CRUNCH:		if (shreg_done_load)	fifo_t_rst <= 1'b0;
-		endcase
-
-
-	reg	[32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0]	pe_p_msb_dly;	// registered products of PEs 1 and up
-	
-	always @(posedge clk)
-		// pe_p[j] is stored one lane lower, in lane j-1
-		for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-			pe_p_msb_dly[32 * j - 1 -: 32] <= pe_p[j];
-			
-	wire	[31: 0]	pe_p_lsb_masked = shreg_now_unloading ? pe_p[0] : 32'd0;	// PE 0 product, forced to zero outside the unload phase
-	assign fifo_t_din = {pe_p_lsb_masked, pe_p_msb_dly};	// the top lane takes the masked PE 0 product
-
-	
-	
-	reg shreg_now_unloading_dly;	// unload flag delayed by one clock
-	always @(posedge clk)
-		shreg_now_unloading_dly <= shreg_now_unloading;
-	
-	assign fifo_c_wren = shreg_now_unloading_dly;	// both FIFOs are written during the delayed unload phase
-	assign fifo_c_rden = shreg_now_loading;	// both FIFOs are read during the load phase
-	
-	assign fifo_t_wren = shreg_now_unloading_dly;	
-	assign fifo_t_rden = shreg_now_loading;
-	
-	
-		
-		
-	always @(posedge clk) begin
-		// AB / Q / QN addresses, keyed on fsm_state: zeroed in *_START, advanced in *_RELOAD (the Q*N pass steps QN and AB together)
-		case (fsm_state)
-			FSM_STATE_MULT_A_B_START:				ab_addr_ext		<= bram_addr_ext_zero;
-			FSM_STATE_MULT_AB_N_COEFF_START:		q_addr			<= bram_addr_zero;
-			FSM_STATE_MULT_Q_N_START: begin		qn_addr_ext		<= bram_addr_ext_zero;
-															ab_addr_ext		<= bram_addr_ext_zero;															
-															end
-			
-			FSM_STATE_MULT_A_B_RELOAD:				ab_addr_ext		<= ab_addr_ext_next;
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	q_addr			<= q_addr_next;
-			FSM_STATE_MULT_Q_N_RELOAD: begin		qn_addr_ext		<= qn_addr_ext_next;
-															ab_addr_ext		<= ab_addr_ext_next;
-															end
-		endcase
-		// S / SN addresses: start when qn_addr_ext reaches the last narrow address, advance up to the last extended address, then restart for read-back
-		case (fsm_state)
-		
-			FSM_STATE_MULT_Q_N_RELOAD: begin
-				if (qn_addr_ext == {1'b0, bram_addr_last}) begin
-					s_addr	<= bram_addr_zero;
-					sn_addr	<= bram_addr_zero;
-				end
-				
-				if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
-					s_addr <= s_addr_next;
-					sn_addr <= sn_addr_next;
-				end
-
-				if (qn_addr_ext == bram_addr_ext_last) begin
-					s_addr <= bram_addr_zero;
-					sn_addr <= bram_addr_zero;
-				end
-			
-			end
-			
-			FSM_STATE_MULT_Q_N_FINAL,
-			FSM_STATE_SAVE_START,
-			FSM_STATE_SAVE_WRITE: begin
-				s_addr <= !s_addr_done ? s_addr_next : s_addr;
-				sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
-			end
-			
-		endcase
-		
-		// AB address during the AB*N_COEFF pass (keyed on fsm_next_state)
-		case (fsm_next_state)
-			FSM_STATE_MULT_AB_N_COEFF_START:		ab_addr_ext <= bram_addr_ext_zero;
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	ab_addr_ext <= ab_addr_ext_next;
-		endcase
-		// Q address during the Q*N pass (keyed on fsm_next_state); held once q_addr_done is set
-		case (fsm_next_state)
-			FSM_STATE_MULT_Q_N_START:		q_addr <= bram_addr_zero;
-			FSM_STATE_MULT_Q_N_RELOAD:		q_addr <= !q_addr_done ? q_addr_next : q_addr;
-		endcase
-
-		//
-	end
-		
-	always @(posedge clk) begin
-		// AB buffer: takes pe_p[0] when shreg_done_latency_dly is set during the A*B crunch; data is X otherwise
-		if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin
-			ab_wren <= shreg_done_latency_dly;
-			ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
-		end else begin
-			ab_wren <= 1'b0;
-			ab_data_in <= 32'hXXXXXXXX;
-		end
-		// Q buffer: same rule during the AB*N_COEFF crunch
-		if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin
-			q_wren <= shreg_done_latency_dly;
-			q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
-		end else begin
-			q_wren <= 1'b0;
-			q_data_in <= 32'hXXXXXXXX;
-		end
-		// QN buffer: same rule during the Q*N crunch
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
-			qn_wren <= shreg_done_latency_dly;
-			qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
-		end else begin
-			qn_wren <= 1'b0;
-			qn_data_in <= 32'hXXXXXXXX;
-		end		
-		// result write strobe: set in SAVE_START, kept in SAVE_WRITE until r_addr_done
-		case (fsm_state)
-			FSM_STATE_SAVE_START:	r_wren <= 1'b1;
-			FSM_STATE_SAVE_WRITE:	r_wren <= ~r_addr_done;
-			default:						r_wren <= 1'b0;
-		endcase
-		//
-	end
-	
-	
-	always @(posedge clk)
-		// load counter, keyed on fsm_next_state
-		case (fsm_next_state)
-			FSM_STATE_MULT_A_B_START,
-			FSM_STATE_MULT_AB_N_COEFF_START,
-			FSM_STATE_MULT_Q_N_START,
-			FSM_STATE_MULT_A_B_RELOAD,
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
-			FSM_STATE_MULT_Q_N_RELOAD:
-				// restart on entry to every START / RELOAD state
-				syst_cnt_load <= syst_cnt_zero;
-			
-			FSM_STATE_MULT_A_B_CRUNCH,
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-			FSM_STATE_MULT_Q_N_CRUNCH:
-				// step until syst_cnt_load_done, then hold
-				syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-				
-		endcase
-
-		
-		
-	always @(posedge clk)
-		// unload counter: restarted when shreg_done_latency fires, then stepped while unloading until syst_cnt_unload_done
-		case (fsm_state)
-			FSM_STATE_MULT_A_B_CRUNCH,
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-			FSM_STATE_MULT_Q_N_CRUNCH: begin
-		
-			if (shreg_done_latency)	syst_cnt_unload <= syst_cnt_zero;
-			else if (shreg_now_unloading)
-				syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
-
-			end
-		endcase
-	
-		
-			//
-			// T and C_IN can be moved to a separate code block
-			//
-	always @(posedge clk) begin
-		// A*B pass: every PE gets the same A word (zero once ab_addr_ext is above a_addr) and its own loader word
-		if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH)
-			//
-			for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-				// operands are only valid during the load phase; X otherwise
-				if (shreg_now_loading) begin
-					pe_a[j]		<= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out;
-					pe_b[j]		<= loader_dout[j];
-					//pe_t[j]		<= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
-					//pe_c_in[j]	<= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
-				end else begin
-					pe_a[j]		<= 32'hXXXXXXXX;				
-					pe_b[j]		<= 32'hXXXXXXXX;
-					//pe_t[j]		<= 32'hXXXXXXXX;
-					//pe_c_in[j]	<= 32'hXXXXXXXX;
-				end
-		// AB*N_COEFF pass: every PE gets the same AB word and its own loader word
-		if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)
-			//
-			for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-				//
-				if (shreg_now_loading) begin
-					pe_a[j]		<= ab_data_out;
-					pe_b[j]		<= loader_dout[j];
-					//pe_t[j]		<= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
-					//pe_c_in[j]	<= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
-				end else begin
-					pe_a[j]		<= 32'hXXXXXXXX;				
-					pe_b[j]		<= 32'hXXXXXXXX;
-					//pe_t[j]		<= 32'hXXXXXXXX;
-					//pe_c_in[j]	<= 32'hXXXXXXXX;
-				end
-		// Q*N pass: every PE gets the same Q word (zero once qn_addr_ext is above q_addr) and its own loader word
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			//
-			for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-				//
-				if (shreg_now_loading) begin
-					pe_a[j]		<= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
-					pe_b[j]		<= loader_dout[j];
-					//pe_t[j]		<= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
-					//pe_c_in[j]	<= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
-				end else begin
-					pe_a[j]		<= 32'hXXXXXXXX;				
-					pe_b[j]		<= 32'hXXXXXXXX;
-					//pe_t[j]		<= 32'hXXXXXXXX;
-					//pe_c_in[j]	<= 32'hXXXXXXXX;
-				end
-		// in any other state pe_a / pe_b keep their last value
-	
-		//
-	end
-		
-		
-		//
-		// Adder
-		//
-		/*
-		 * This adder is used to calculate S = AB + QN.
-		 *
-		 */
-	reg				add1_ce;					// clock enable
-	reg	[31: 0]	add1_s;					// sum output
-	wire				add1_c_in;				// carry input
-	wire	[31: 0]	add1_a;					// A-input
-	reg	[31: 0]	add1_b;					// B-input
-	reg				add1_c_in_mask;		// flag to not carry anything into the very first word
-	reg				add1_c_out;				// carry output
-	
-		/* add masking into carry feedback chain */
-	assign add1_c_in = add1_c_out & ~add1_c_in_mask;
-
-		/* mask carry for the very first word (older mask logic kept below, commented out) */
-	//always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
-	
-	always @(posedge  clk)
-		// 33-bit add, word by word: the carry out is kept for the next enabled cycle
-		if (add1_ce)
-			//
-			{add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in};
-	
-	assign add1_a = qn_data_in;	// A-input is the word presented to the QN buffer
-	
-	always @(posedge clk)
-		// B-input is the AB buffer output, sampled when shreg_done_latency_dly is set in the Q*N crunch
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			add1_b <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX;
-		else
-			add1_b <= 32'hXXXXXXXX;
-
-	always @(posedge clk)
-		// block the carry-in while ab_addr_ext is still zero
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			add1_c_in_mask <= (shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero)) ? 1'b1 : 1'b0;
-		else
-			add1_c_in_mask <= 1'b0;
-
-	always @(posedge clk)
-		// the adder is enabled for one clock per word, aligned with add1_b
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			add1_ce <= shreg_done_latency_dly;
-		else
-			add1_ce <= 1'b0;
-
-
-		//
-		// Subtractor
-		//
-		/*
-		 * This subtractor is used to calculate SN = S - N.
-		 *
-		 * It is declared ahead of the S / SN write-port mapping below so that
-		 * sub1_d and sub1_ce are declared before they are referenced.
-		 *
-		 */
-	reg				sub1_ce;					// clock enable
-	reg	[31: 0]	sub1_d;					// difference output
-	wire				sub1_b_in;				// borrow input
-	wire	[31: 0]	sub1_a;					// A-input
-	reg	[31: 0]	sub1_b;					// B-input
-	reg				sub1_b_in_mask;		// flag to not borrow anything from the very first word
-	reg				sub1_b_out;				// borrow output
-	
-		/* add masking into borrow feedback chain */
-	assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
-	
-	always @(posedge  clk)
-		// 33-bit subtract, word by word: the borrow out is kept for the next enabled cycle
-		if (sub1_ce)
-			//
-			{sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};
-	
-	assign sub1_a = add1_s;	// A-input is the adder's sum output
-	
-	always @(posedge clk)
-		// B-input is the external N word, sampled while the adder is enabled
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
-		else
-			sub1_b <= 32'hXXXXXXXX;
-
-	always @(posedge clk)
-		// block the borrow-in when (qn_addr_ext - 1) equals the last narrow address
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
-		else
-			sub1_b_in_mask <= 1'b0;
-
-	always @(posedge clk)
-		// the subtractor follows the adder by one clock, only while qn_addr_ext is above q_addr
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
-		else
-			sub1_ce <= 1'b0;
-
-
-		//
-		// S and SN Memory Write Ports
-		//
-		/*
-		 * Each signal is driven from exactly one place. An earlier revision
-		 * repeated the s_data_in assignment and registered s_wren in a second
-		 * always block, which gave s_wren two procedural drivers.
-		 *
-		 */
-	assign s_data_in = add1_s;
-	assign sn_data_in = sub1_d;
-	
-	always @(posedge clk) begin
-		// write enables trail the corresponding clock enables by one clock
-		s_wren <= add1_ce;
-		sn_wren <= sub1_ce;
-	end
-		
-		
-
-	always @(posedge clk)
-		// captured in MULT_Q_N_FINAL: set when the subtractor borrowed and the adder did not carry
-		if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
-			flag_select_s <= sub1_b_out & ~add1_c_out;
-		
-
-	always @(posedge clk)
-		// result word: taken from the S buffer when flag_select_s is set, from the SN buffer otherwise
-		case (fsm_state)
-			FSM_STATE_SAVE_START,
-			FSM_STATE_SAVE_WRITE:
-				r_data_in <= flag_select_s ? s_data_out : sn_data_out;
-		endcase
-
-		
-			
-		//
-		// FSM Process
-		//
-	always @(posedge clk or negedge rst_n)
-		// asynchronous active-low reset to IDLE; otherwise take the decoded next state
-		if (rst_n == 1'b0)	fsm_state <= FSM_STATE_IDLE;
-		else						fsm_state <= fsm_next_state;
-	
-	
-		//
-		// FSM Transition Logic
-		//
-	always @* begin
-		// default assignment: any state not listed below decodes to STOP
-		fsm_next_state = FSM_STATE_STOP;
-		//
-		case (fsm_state)
-
-			FSM_STATE_IDLE:				if (ena_trig)				fsm_next_state = FSM_STATE_LOAD_B_START;
-												else							fsm_next_state = FSM_STATE_IDLE;
-			// load B: SHIFT until mult_cnt_done, WRITE, repeat until syst_cnt_init_done
-			FSM_STATE_LOAD_B_START:											fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
-			FSM_STATE_LOAD_B_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_B_WRITE;
-												else								fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
-			FSM_STATE_LOAD_B_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_B_FINAL;
-												else							fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
-			FSM_STATE_LOAD_B_FINAL:										fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
-			// load N_COEFF: same sequence
-			FSM_STATE_LOAD_N_COEFF_START:											fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
-			FSM_STATE_LOAD_N_COEFF_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
-												else								fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
-			FSM_STATE_LOAD_N_COEFF_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
-												else							fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
-			FSM_STATE_LOAD_N_COEFF_FINAL:										fsm_next_state = FSM_STATE_LOAD_N_START;
-			// load N: same sequence
-			FSM_STATE_LOAD_N_START:											fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
-			FSM_STATE_LOAD_N_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_N_WRITE;
-												else								fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
-			FSM_STATE_LOAD_N_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_N_FINAL;
-												else							fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
-			FSM_STATE_LOAD_N_FINAL:										fsm_next_state = FSM_STATE_MULT_A_B_START;
-			// multiply A * B: CRUNCH until shreg_done_unload, RELOAD, repeat until ab_addr_ext_done
-			FSM_STATE_MULT_A_B_START:									fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
-			FSM_STATE_MULT_A_B_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
-												else							fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
-			FSM_STATE_MULT_A_B_RELOAD:	if (ab_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
-												else							fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
-			FSM_STATE_MULT_A_B_FINAL:									fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
-			// multiply AB * N_COEFF: repeat until q_addr_done
-			FSM_STATE_MULT_AB_N_COEFF_START:									fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
-															else							fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	if (q_addr_done)	fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
-															else							fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
-			FSM_STATE_MULT_AB_N_COEFF_FINAL:									fsm_next_state = FSM_STATE_MULT_Q_N_START;
-			// multiply Q * N: repeat until qn_addr_ext_done
-			FSM_STATE_MULT_Q_N_START:									fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
-			FSM_STATE_MULT_Q_N_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
-															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
-			FSM_STATE_MULT_Q_N_RELOAD:	if (qn_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
-															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
-			FSM_STATE_MULT_Q_N_FINAL:									fsm_next_state = FSM_STATE_SAVE_START;
-			// write the result out: stay in SAVE_WRITE until r_addr_done
-			FSM_STATE_SAVE_START:										fsm_next_state = FSM_STATE_SAVE_WRITE;
-			FSM_STATE_SAVE_WRITE:	if (r_addr_done)				fsm_next_state = FSM_STATE_SAVE_FINAL;
-											else								fsm_next_state = FSM_STATE_SAVE_WRITE;
-			FSM_STATE_SAVE_FINAL:										fsm_next_state = FSM_STATE_STOP;
-			// STOP lasts one cycle, then back to IDLE
-			FSM_STATE_STOP:												fsm_next_state = FSM_STATE_IDLE;
-
-		endcase
-		//
-	end
-
-
-endmodule
-
-//======================================================================
-// End of file
-//======================================================================
diff --git a/src/rtl/modexpa7_systolic_multiplier_old.v b/src/rtl/modexpa7_systolic_multiplier_old.v
deleted file mode 100644
index 8b00370..0000000
--- a/src/rtl/modexpa7_systolic_multiplier_old.v
+++ /dev/null
@@ -1,1260 +0,0 @@
-//======================================================================
-//
-// modexpa7_systolic_multiplier.v
-// -----------------------------------------------------------------------------
-// Systolic Montgomery multiplier.
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2017, NORDUnet A/S All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-// - Redistributions of source code must retain the above copyright
-//   notice, this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright
-//   notice, this list of conditions and the following disclaimer in the
-//   documentation and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may
-//   be used to endorse or promote products derived from this software
-//   without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-//======================================================================
-
-module modexpa7_systolic_multiplier #
-	(
-			//
-			// This sets the address widths of memory buffers. Internal data
-			// width is 32 bits, so for e.g. 2048-bit operands buffers must store
-			// 2048 / 32 = 64 words, and these need 6-bit address bus, because
-			// 2 ** 6 = 64.
-			//
-		parameter	OPERAND_ADDR_WIDTH		= 4,
-		
-			//
-			// Explain.
-			//
-		parameter	SYSTOLIC_ARRAY_POWER		= 1
-	)
-	(
-		input											clk,
-		input											rst_n,
-
-		input											ena,
-		output										rdy,
-
-		output	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr,
-		output	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr,
-		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,
-		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,
-		output	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr,
-
-		input		[                32-1:0]	a_bram_out,
-		input		[                32-1:0]	b_bram_out,
-		input		[                32-1:0]	n_bram_out,
-		input		[                32-1:0]	n_coeff_bram_out,
-
-		output	[                32-1:0]	r_bram_in,
-		output										r_bram_wr,
-
-		input		[OPERAND_ADDR_WIDTH-1:0]	ab_num_words
-	);
-	
-		
-		//
-		// Include Settings
-		//
-	`include "pe/modexpa7_primitive_switch.v"
-	`include "modexpa7_settings.v"
-		
-
-		//
-		// FSM Declaration
-		//
-	localparam	[ 7: 0]	FSM_STATE_IDLE								= 8'h00;
-
-	localparam	[ 7: 0]	FSM_STATE_LOAD_B_START					= 8'h11;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_B_SHIFT					= 8'h12;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_B_WRITE					= 8'h13;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_B_FINAL					= 8'h14;
-
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_START			= 8'h21;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_SHIFT			= 8'h22;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_WRITE			= 8'h23;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_FINAL			= 8'h24;
-
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_START					= 8'h31;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_SHIFT					= 8'h32;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_WRITE					= 8'h33;
-	localparam	[ 7: 0]	FSM_STATE_LOAD_N_FINAL					= 8'h34;
-
-	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_START				= 8'h41;
-	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_CRUNCH				= 8'h42;
-	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_RELOAD				= 8'h43;
-	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_FINAL				= 8'h44;
-
-	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_START		= 8'h51;
-	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_CRUNCH		= 8'h52;
-	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_RELOAD		= 8'h53;
-	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_FINAL		= 8'h54;
-
-	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_START				= 8'h61;
-	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_CRUNCH				= 8'h62;
-	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_ADD_S				= 8'h63;
-	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_SUB_SN				= 8'h64;
-	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_RELOAD				= 8'h65;
-	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_FINAL				= 8'h66;
-	
-	localparam	[ 7: 0]	FSM_STATE_SAVE_START						= 8'h71;
-	localparam	[ 7: 0]	FSM_STATE_SAVE_WRITE						= 8'h72;
-	localparam	[ 7: 0]	FSM_STATE_SAVE_FINAL						= 8'h73;	
-	
-	localparam	[ 7: 0]	FSM_STATE_STOP								= 8'hFF;
-	
-		//
-		// FSM State / Next State
-		//
-	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
-	reg	[ 7: 0]	fsm_next_state;
-
-
-		//
-		// Enable Delay and Trigger
-		//
-   reg ena_dly = 1'b0;
-	
-		/* delay enable by one clock cycle */
-   always @(posedge clk) ena_dly <= ena;
-
-		/* trigger new operation when enable goes high */
-   wire ena_trig = ena && !ena_dly;
-	
-	
-		//
-		// Ready Flag Logic
-		//
-	reg rdy_reg = 1'b1;
-	assign rdy = rdy_reg;
-
-   always @(posedge clk or negedge rst_n)
-		//
-		// Ready flag: forced high by the asynchronous active-low reset,
-		// otherwise updated only in the IDLE and STOP states.
-		//
-		if (!rst_n)
-			rdy_reg <= 1'b1;
-		else
-			case (fsm_state)
-					/* idle: drop the flag in the cycle a new operation is triggered */
-				FSM_STATE_IDLE:	rdy_reg <= !ena_trig;
-					/* stop: operation finished, raise the flag again */
-				FSM_STATE_STOP:	rdy_reg <= 1'b1;
-			endcase
-		
-		
-		//
-		// Parameters Latch
-		//
-	reg	[OPERAND_ADDR_WIDTH-1:0]	ab_num_words_latch;
-
-		/* save number of words in a and b when new operation starts */
-	always @(posedge clk)
-		//
-		if (fsm_next_state == FSM_STATE_LOAD_B_START)
-			ab_num_words_latch <= ab_num_words;
-			
-			
-		//
-		// Systolic Cycle Counters
-		//
-		
-		/* handy values */
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
-	
-		/* counters */
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_init;
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load;
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload;
-		
-		/* handy increment values */
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_init_next		= syst_cnt_init   + 1'b1;
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_next		= syst_cnt_load   + 1'b1;
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload_next		= syst_cnt_unload + 1'b1;
-
-		/* handy stop flags */
-	wire										syst_cnt_init_done		= (syst_cnt_init   == syst_cnt_last) ? 1'b1 : 1'b0;
-	wire										syst_cnt_load_done		= (syst_cnt_load   == syst_cnt_last) ? 1'b1 : 1'b0;
-	wire										syst_cnt_unload_done		= (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
-
-		/* delayed load counter */
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_dly;
-	always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;
-
-
-		//
-		// Multiplier Iteration Counter
-		//
-		
-		/* handy values */
-	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
-	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
-	
-		/* counter */
-	reg	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt;
-	
-		/* handy increment value and stop flag */
-	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_next = mult_cnt + 1'b1;
-	wire										mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;
-			
-			
-		//
-		// Initialization Counter Control Logic
-		//
-	always @(posedge clk) begin
-		//
-		// mult_cnt: cleared in every *_START state and incremented once per
-		// cycle spent in a *_SHIFT state (it simply wraps at its
-		// SYSTOLIC_ARRAY_POWER-bit width); held in all other states.
-		//
-		case (fsm_state)
-			FSM_STATE_LOAD_B_START,
-			FSM_STATE_LOAD_N_COEFF_START,
-			FSM_STATE_LOAD_N_START:				mult_cnt <= mult_cnt_zero;
-			
-			FSM_STATE_LOAD_B_SHIFT,
-			FSM_STATE_LOAD_N_COEFF_SHIFT,
-			FSM_STATE_LOAD_N_SHIFT:				mult_cnt <= mult_cnt_next;
-		endcase
-		//
-		// syst_cnt_init: cleared in every *_START state and advanced in every
-		// *_WRITE state; the increment saturates once syst_cnt_init_done is set,
-		// i.e. the counter stops at syst_cnt_last instead of wrapping.
-		//
-		case (fsm_state)
-			FSM_STATE_LOAD_B_START,
-			FSM_STATE_LOAD_N_COEFF_START,
-			FSM_STATE_LOAD_N_START:				syst_cnt_init <= syst_cnt_zero;
-			
-			FSM_STATE_LOAD_B_WRITE,
-			FSM_STATE_LOAD_N_COEFF_WRITE,
-			FSM_STATE_LOAD_N_WRITE:				syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
-		endcase
-		//
-	end
-	
-	
-		//
-		// Operand Loader
-		//
-	
-		/*
-		 * Explain how parallelized loader works here...
-		 *
-		 */
-	
-		/* loader banks */
-	localparam	[ 1: 0]	LOADER_ADDR_MSB_B				= 2'd0;
-	localparam	[ 1: 0]	LOADER_ADDR_MSB_N_COEFF		= 2'd1;
-	localparam	[ 1: 0]	LOADER_ADDR_MSB_N				= 2'd2;
-	
-		/* loader input */
-	reg	[                  2-1:0]	loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];	
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
-	reg										loader_wren    [0:SYSTOLIC_ARRAY_LENGTH-1];
-	reg	[                 32-1:0]	loader_din     [0:SYSTOLIC_ARRAY_LENGTH-1];
-	
-		/* loader output */
-	wire	[                 32-1:0]	loader_dout    [0:SYSTOLIC_ARRAY_LENGTH-1];
-			
-		/* generate parallelized loader */
-		
-		//
-		// Loader currently stores B, N_COEFF and N, it can be coded another way
-		// to initially store B, then AB, then Q. Some memory can be saved that way.
-		// Maybe later...
-		//
-		
-	genvar i;
-	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
-		//
-		begin : gen_bram_1rw_readfirst_loader
-			//
-			bram_1rw_readfirst #
-			(
-				.MEM_WIDTH		(32),
-				.MEM_ADDR_BITS	(SYSTOLIC_CNTR_WIDTH + 2)
-			)
-			bram_loader
-			(
-				.clk		(clk),
-				.a_addr	({loader_addr_msb[i], loader_addr_lsb[i]}),
-				.a_wr		(loader_wren[i]),
-				.a_in		(loader_din[i]),
-				.a_out	(loader_dout[i])
-			);
-			//
-		end
-		//
-	endgenerate
-	
-
-		//
-		// Block Memory Addresses
-		//
-		
-		/*
-		 * Explain why there are two memory sizes.
-		 *
-		 */
-		
-		/* the very first addresses */
-	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero			= {      {OPERAND_ADDR_WIDTH{1'b0}}};
-	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_zero	= {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
-	
-		/* the very last addresses */
-	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last     = {ab_num_words_latch};
-	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_last = {ab_num_words_latch, 1'b1};
-
-		/* address registers */
-	reg	[OPERAND_ADDR_WIDTH-1:0]	a_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr;
-	reg	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	q_addr;
-	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	s_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_addr;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr;
-		
-		/* handy increment values */
-	wire	[OPERAND_ADDR_WIDTH-1:0]	a_addr_next			= a_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next			= b_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_next	= n_coeff_addr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	n_addr_next			= n_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_next	= ab_addr_ext  + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_next			= q_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_next	= qn_addr_ext  + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	s_addr_next			= s_addr       + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_addr_next		= sn_addr      + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	r_addr_next			= r_addr       + 1'b1;
-	
-		/* handy stop flags */
-	wire	a_addr_done			= (a_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	b_addr_done			= (b_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	n_coeff_addr_done	= (n_coeff_addr  == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	n_addr_done			= (n_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	ab_addr_ext_done	= (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
-	wire	q_addr_done			= (q_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	qn_addr_ext_done	= (qn_addr_ext     == bram_addr_ext_last)     ? 1'b1 : 1'b0;
-	wire	s_addr_done	= (s_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	sn_addr_done	= (sn_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
-	wire	r_addr_done	= (r_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
-
-		/* delayed B, N_COEFF and N addresses */
-	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_dly;
-	always @(posedge clk) b_addr_dly <= b_addr;
-
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_dly;
-	always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr;
-
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr_dly;
-	always @(posedge clk) n_addr_dly <= n_addr;
-				
-		/* map registers to top-level ports */
-	assign a_bram_addr = a_addr;
-	assign b_bram_addr = b_addr;
-	assign n_coeff_bram_addr = n_coeff_addr;
-	assign n_bram_addr = n_addr;
-	assign r_bram_addr = r_addr;
-
-
-		//
-		// Flag
-		//
-	reg	flag_select_s;
-	
-	
-		//
-		// Memory Address Control Logic
-		//
-	always @(posedge clk) begin
-		//
-		// Operand read addresses during the load phase. Keyed on the NEXT
-		// state, so the address register is already updated when the state
-		// is entered: zeroed for *_START, incremented for *_SHIFT.
-		//
-		case (fsm_next_state)
-			FSM_STATE_LOAD_B_START:				b_addr <= bram_addr_zero;
-			FSM_STATE_LOAD_N_COEFF_START:		n_coeff_addr <= bram_addr_zero;
-			FSM_STATE_LOAD_N_START:				n_addr <= bram_addr_zero;
-			
-			FSM_STATE_LOAD_B_SHIFT:				b_addr <= b_addr_next;
-			FSM_STATE_LOAD_N_COEFF_SHIFT:		n_coeff_addr <= n_coeff_addr_next;
-			FSM_STATE_LOAD_N_SHIFT:				n_addr <= n_addr_next;
-		endcase
-		//
-		// n_addr is reused during the Q*N phase: on each RELOAD it is zeroed
-		// when qn_addr_ext equals the last operand word index and incremented
-		// when qn_addr_ext is already past it; below that index it is held.
-		// This case follows the one above, so its assignment to n_addr would
-		// take precedence if both ever matched in the same cycle.
-		//
-		case (fsm_state)
-			FSM_STATE_MULT_Q_N_RELOAD: 
-				if (qn_addr_ext == {1'b0, bram_addr_last})
-					n_addr		<= bram_addr_zero;
-				else if (qn_addr_ext > {1'b0, bram_addr_last})
-					n_addr		<= n_addr_next;
-			
-		endcase
-		//
-		// Result write address: zeroed in SAVE_START, then incremented on
-		// every cycle spent in SAVE_WRITE (no saturation here).
-		//
-		case (fsm_state)
-			FSM_STATE_SAVE_START:	r_addr <= bram_addr_zero;
-			FSM_STATE_SAVE_WRITE:	r_addr <= r_addr_next;
-		endcase
-		//
-		// A operand read address for the A*B phase: zeroed when entering
-		// START, advanced when entering RELOAD, saturating at the last word.
-		//
-		case (fsm_next_state)
-			FSM_STATE_MULT_A_B_START:	a_addr <= bram_addr_zero;
-			FSM_STATE_MULT_A_B_RELOAD:	a_addr <= !a_addr_done ? a_addr_next : a_addr;
-		endcase
-		//
-	end
-	
-	
-		//
-		// Internal Memories
-		//
-
-		/* memory inputs */
-	reg	[31: 0]	ab_data_in;
-	reg	[31: 0]	q_data_in;
-	reg	[31: 0]	qn_data_in;
-	wire	[31: 0]	s_data_in;
-	wire	[31: 0]	sn_data_in;
-	reg	[31: 0]	r_data_in;
-
-		/* memory outputs */
-	wire	[31: 0]	ab_data_out;
-	wire	[31: 0]	q_data_out;
-	wire	[31: 0]	qn_data_out;
-	wire	[31: 0]	s_data_out;
-	wire	[31: 0]	sn_data_out;
-
-		/* write enables */
-	reg	ab_wren;
-	reg	q_wren;
-	reg	qn_wren;
-	reg	s_wren;
-	reg	sn_wren;
-	reg	r_wren;
-	
-		/* map */
-	assign r_bram_in = r_data_in;
-	assign r_bram_wr = r_wren;
-
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
-	bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
-
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
-	bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
-	
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
-	bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
-
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
-	bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));
-
-	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
-	bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));
-
-	
-		//
-		// Wide Operand Loader
-		//
-	integer j;
-	
-		/* shift logic */
-	always @(posedge clk)
-		//
-		// loader_din[] acts as a word-wide shift register: in each *_SHIFT
-		// state a new word enters at index SYSTOLIC_ARRAY_LENGTH-1 and all
-		// other entries move one index down. The three branches differ only
-		// in which operand port and delayed address they use.
-		//
-		// NOTE: inside the conditions below "<=" is the less-than-or-equal
-		// comparison: words whose (delayed) address lies beyond bram_addr_last
-		// are replaced by zero. The delayed address is presumably used to
-		// line up with the BRAM read latency -- TODO confirm.
-		//
-		case (fsm_state)
-			//
-			FSM_STATE_LOAD_B_SHIFT: begin
-		
-						/* update the rightmost part of loader buffer */
-				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
-				
-						/* shift the loader buffer to the left */
-				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_din[j-1] <= loader_din[j];
-					
-			end
-			//
-			FSM_STATE_LOAD_N_COEFF_SHIFT: begin
-		
-						/* update the rightmost part of loader buffer */
-				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : {32{1'b0}};
-				
-						/* shift the loader buffer to the left */
-				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_din[j-1] <= loader_din[j];
-					
-			end
-			//
-			FSM_STATE_LOAD_N_SHIFT: begin
-		
-						/* update the rightmost part of loader buffer */
-				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
-				
-						/* shift the loader buffer to the left */
-				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_din[j-1] <= loader_din[j];
-					
-			end					
-			//
-		endcase
-		
-
-		/* write enable logic */
-	always @(posedge clk)
-		//
-		// Every loader bank gets the same registered write strobe: high when
-		// the FSM is about to enter one of the *_WRITE states, low otherwise.
-		//
-		for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-			//
-			case (fsm_next_state)
-				FSM_STATE_LOAD_B_WRITE,
-				FSM_STATE_LOAD_N_COEFF_WRITE,
-				FSM_STATE_LOAD_N_WRITE:		loader_wren[j] <= 1'b1;
-				default:							loader_wren[j] <= 1'b0;
-			endcase
-
-		/* loader address update logic */
-	always @(posedge clk) begin
-		//
-		// Drives the loader BRAM address of all banks at once: every element
-		// of loader_addr_lsb[] / loader_addr_msb[] receives the same value.
-		//
-		// The first two case statements both assign loader_addr_lsb; should
-		// they ever match in the same cycle, the later one takes effect.
-		//
-		// 1) Initialization phase (keyed on the current state): the word
-		//    address restarts at zero in *_START and tracks the saturating
-		//    syst_cnt_init counter in *_WRITE.
-		//
-		case (fsm_state)
-		
-			FSM_STATE_LOAD_B_START,
-			FSM_STATE_LOAD_N_COEFF_START,
-			FSM_STATE_LOAD_N_START:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_lsb[j] <= syst_cnt_zero;
-					
-			FSM_STATE_LOAD_B_WRITE,
-			FSM_STATE_LOAD_N_COEFF_WRITE,
-			FSM_STATE_LOAD_N_WRITE:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
-					
-		endcase
-		//
-		// 2) Multiplication phases (keyed on the next state): the word
-		//    address restarts at zero when entering *_START / *_RELOAD and
-		//    tracks the saturating syst_cnt_load counter when entering *_CRUNCH.
-		//
-		case (fsm_next_state)
-			FSM_STATE_MULT_A_B_START,
-			FSM_STATE_MULT_AB_N_COEFF_START,
-			FSM_STATE_MULT_Q_N_START,
-			FSM_STATE_MULT_A_B_RELOAD,
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
-			FSM_STATE_MULT_Q_N_RELOAD:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_lsb[j] <= syst_cnt_zero;
-													
-			FSM_STATE_MULT_A_B_CRUNCH,
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-			FSM_STATE_MULT_Q_N_CRUNCH:
-				//
-				// Saturate at the load counter itself so the address always
-				// mirrors syst_cnt_load (the fallback used to be syst_cnt_init,
-				// a counter that belongs to the initialization phase).
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-		endcase
-		//
-		// 3) Bank select (upper address bits): switched when entering the
-		//    *_START state of a load or multiplication phase, held otherwise.
-		//    B is used for A*B, N_COEFF for AB*N_COEFF and N for Q*N.
-		//
-		case (fsm_next_state)
-		
-			FSM_STATE_LOAD_B_START,
-			FSM_STATE_MULT_A_B_START:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_msb[j] <= LOADER_ADDR_MSB_B;
-
-			FSM_STATE_LOAD_N_COEFF_START,
-			FSM_STATE_MULT_AB_N_COEFF_START:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
-					
-			FSM_STATE_LOAD_N_START,
-			FSM_STATE_MULT_Q_N_START:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr_msb[j] <= LOADER_ADDR_MSB_N;
-
-		endcase
-		//
-	end
-	
-	
-		//
-		// Systolic Array of Processing Elements
-		//
-	reg	[31: 0]	pe_a        [0:SYSTOLIC_ARRAY_LENGTH-1];
-	reg	[31: 0]	pe_b        [0:SYSTOLIC_ARRAY_LENGTH-1];
-	wire	[31: 0]	pe_t        [0:SYSTOLIC_ARRAY_LENGTH-1];
-	wire	[31: 0]	pe_c_in     [0:SYSTOLIC_ARRAY_LENGTH-1];
-	wire	[31: 0]	pe_p        [0:SYSTOLIC_ARRAY_LENGTH-1];
-	wire	[31: 0]	pe_c_out    [0:SYSTOLIC_ARRAY_LENGTH-1];
-	reg	[31: 0]	pe_c_out_dly[0:SYSTOLIC_ARRAY_LENGTH-1];
-	
-
-		//
-		// These can be turned into a FIFO (maybe later?)...
-		//
-	//reg	[31: 0]	pe_c_out_mem[0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
-	//reg	[31: 0]	pe_t_mem    [0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
-
-	reg	fifo_c_rst;
-	reg	fifo_t_rst;
-
-	wire	fifo_c_wren;
-	wire	fifo_c_rden;
-	
-	wire	fifo_t_wren;
-	wire	fifo_t_rden;
-		
-	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_c_din;
-	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_c_dout;
-	
-	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_t_din;
-	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_t_dout;
-	
-	/**/
-	modexpa7_simple_fifo #
-	(
-		.BUS_WIDTH	(32 * SYSTOLIC_ARRAY_LENGTH),
-		.DEPTH_BITS	(SYSTOLIC_CNTR_WIDTH)
-	)
-	fifo_c
-	(
-		.clk			(clk),
-		.rst			(fifo_c_rst),
-		.wr_en		(fifo_c_wren),
-		.d_in			(fifo_c_din),
-		.rd_en		(fifo_c_rden),
-		.d_out		(fifo_c_dout)
-	);
-	
-	modexpa7_simple_fifo #
-	(
-		.BUS_WIDTH	(32 * SYSTOLIC_ARRAY_LENGTH),
-		.DEPTH_BITS	(SYSTOLIC_CNTR_WIDTH)
-	)
-	fifo_t
-	(
-		.clk			(clk),
-		.rst			(fifo_t_rst),
-		.wr_en		(fifo_t_wren),
-		.d_in			(fifo_t_din),
-		.rd_en		(fifo_t_rden),
-		.d_out		(fifo_t_dout)
-	);
-	
-	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
-		begin : modexpa7_systolic_pe_multiplier		
-			modexpa7_systolic_pe systolic_pe_inst
-			(
-				.clk		(clk),
-				.a			(pe_a[i]),
-				.b			(pe_b[i]),
-				.t			(pe_t[i]),
-				.c_in		(pe_c_in[i]),
-				.p			(pe_p[i]),
-				.c_out	(pe_c_out[i])
-			);
-			assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
-			assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];
-			assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];
-			always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];
-		end
-	endgenerate
-
-
-
-		
-			
-			//
-			// Shift Registers
-			//
-	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_load;
-	reg	[SYSTOLIC_PE_LATENCY  :0]	shreg_latency;
-	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_unload;
-
-	wire	shreg_done_load = shreg_load[syst_cnt_last];
-	wire	shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
-	wire	shreg_done_unload = shreg_unload[syst_cnt_last];
-
-	reg										shreg_now_loading;
-	reg										shreg_now_latency;
-	reg										shreg_now_unloading;
-	
-	reg										shreg_done_latency_dly;
-	
-	always @(posedge clk)
-		shreg_done_latency_dly <= shreg_done_latency;
-
-	always @(posedge clk)
-		//
-		// Sequencing of one systolic pass: three shift registers carry a
-		// single marker bit, three flags tell which phase is active.
-		//
-		case (fsm_state)
-				/* last state of the load phase: clear all three shift registers */
-			FSM_STATE_LOAD_N_FINAL: begin
-				shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
-				shreg_latency	<= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
-				shreg_unload	<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
-			end
-			//
-			// start of a pass: seed the marker in bit 0 of the load and latency
-			// registers, clear the unload register, flag "loading" and "latency"
-			//
-			FSM_STATE_MULT_A_B_START,
-			FSM_STATE_MULT_AB_N_COEFF_START,
-			FSM_STATE_MULT_Q_N_START,
-			FSM_STATE_MULT_A_B_RELOAD,
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
-			FSM_STATE_MULT_Q_N_RELOAD: begin
-				shreg_now_loading	<= 1'b1;
-				shreg_now_latency <= 1'b1;
-				shreg_now_unloading <= 1'b0;
-				shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
-				shreg_latency	<= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
-				shreg_unload	<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
-			end
-			//
-			// during the pass: shift every register by one position; the top
-			// bit of the latency register is what enters the unload register
-			//
-			FSM_STATE_MULT_A_B_CRUNCH,
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-			FSM_STATE_MULT_Q_N_CRUNCH: begin
-				shreg_load		<= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
-				shreg_latency	<= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
-				shreg_unload	<= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
-				
-					/* loading ends on shreg_done_load; the latency phase ends and */
-					/* unloading begins on shreg_done_latency; unloading ends on */
-					/* shreg_done_unload (shreg_done_latency has priority over it) */
-				if (shreg_done_load) shreg_now_loading <= 1'b0;
-				if (shreg_done_latency) shreg_now_latency <= 1'b0;
-				if (shreg_done_latency) shreg_now_unloading <= 1'b1;
-				else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
-				
-			end
-			//
-			// any other state: all phase flags low, shift registers keep their value
-			//
-			default: begin
-				shreg_now_loading <= 1'b0;
-				shreg_now_latency <= 1'b0;
-				shreg_now_unloading <= 1'b0;
-			end
-			//
-		endcase
-		
-		
-	always @(posedge clk)
-		//
-		// fifo_c reset: asserted in the *_START state of each multiplication
-		// phase, released in *_CRUNCH once shreg_done_load is seen, held otherwise.
-		//
-		if (	(fsm_state == FSM_STATE_MULT_A_B_START)				||
-				(fsm_state == FSM_STATE_MULT_AB_N_COEFF_START)		||
-				(fsm_state == FSM_STATE_MULT_Q_N_START)				)
-			//
-			fifo_c_rst <= 1'b1;
-			//
-		else if (	shreg_done_load &&
-					(	(fsm_state == FSM_STATE_MULT_A_B_CRUNCH)			||
-						(fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)	||
-						(fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)			)	)
-			//
-			fifo_c_rst <= 1'b0;
-
-	always @(posedge clk)
-		//
-		// fifo_t reset: asserted in the *_START state of each multiplication
-		// phase, released in *_CRUNCH once shreg_done_load is seen, held otherwise.
-		//
-		if (	(fsm_state == FSM_STATE_MULT_A_B_START)				||
-				(fsm_state == FSM_STATE_MULT_AB_N_COEFF_START)		||
-				(fsm_state == FSM_STATE_MULT_Q_N_START)				)
-			//
-			fifo_t_rst <= 1'b1;
-			//
-		else if (	shreg_done_load &&
-					(	(fsm_state == FSM_STATE_MULT_A_B_CRUNCH)			||
-						(fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)	||
-						(fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)			)	)
-			//
-			fifo_t_rst <= 1'b0;
-
-
-	reg	[32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0]	pe_p_msb_dly;
-	
-	always @(posedge clk)
-		//
-		for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-			pe_p_msb_dly[32 * j - 1 -: 32] <= pe_p[j];
-			
-	wire	[31: 0]	pe_p_lsb_masked = shreg_now_unloading ? pe_p[0] : 32'd0;
-	assign fifo_t_din = {pe_p_lsb_masked, pe_p_msb_dly};
-
-	
-	
-	reg shreg_now_unloading_dly;
-	always @(posedge clk)
-		shreg_now_unloading_dly <= shreg_now_unloading;
-	
-	assign fifo_c_wren = shreg_now_unloading_dly;
-	assign fifo_c_rden = shreg_now_loading;
-	
-	assign fifo_t_wren = shreg_now_unloading_dly;	
-	assign fifo_t_rden = shreg_now_loading;
-	
-	
-		
-		
-	always @(posedge clk) begin
-		//
-		case (fsm_state)
-			FSM_STATE_MULT_A_B_START:				ab_addr_ext		<= bram_addr_ext_zero;
-			FSM_STATE_MULT_AB_N_COEFF_START:		q_addr			<= bram_addr_zero;
-			FSM_STATE_MULT_Q_N_START: begin		qn_addr_ext		<= bram_addr_ext_zero;
-															ab_addr_ext		<= bram_addr_ext_zero;															
-															end
-			
-			FSM_STATE_MULT_A_B_RELOAD:				ab_addr_ext		<= ab_addr_ext_next;
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	q_addr			<= q_addr_next;
-			FSM_STATE_MULT_Q_N_RELOAD: begin		qn_addr_ext		<= qn_addr_ext_next;
-															ab_addr_ext		<= ab_addr_ext_next;
-															end
-		endcase
-		//
-		case (fsm_state)
-			
-			FSM_STATE_MULT_Q_N_RELOAD: begin
-				//
-				if (qn_addr_ext == {1'b0, bram_addr_last}) begin
-					s_addr	<= bram_addr_zero;
-					sn_addr	<= bram_addr_zero;
-				end
-				//
-				if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
-					s_addr <= s_addr_next;
-					sn_addr <= sn_addr_next;
-				end
-				//
-				if (qn_addr_ext == bram_addr_ext_last) begin
-					s_addr <= bram_addr_zero;
-					sn_addr <= bram_addr_zero;
-				end
-				//
-			end			
-		//
-		/*
-		case (fsm_state)
-			
-			FSM_STATE_MULT_Q_N_RELOAD: begin
-				if (qn_addr_ext == {1'b0, bram_addr_last}) begin
-					s_addr	<= bram_addr_zero;
-					sn_addr	<= bram_addr_zero;
-				end
-				
-				if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
-					s_addr <= s_addr_next;
-					sn_addr <= sn_addr_next;
-				end
-
-				if (qn_addr_ext == bram_addr_ext_last) begin
-					s_addr <= bram_addr_zero;
-					sn_addr <= bram_addr_zero;
-				end
-			
-			end
-			
-			FSM_STATE_MULT_Q_N_FINAL,
-			FSM_STATE_SAVE_START,
-			FSM_STATE_SAVE_WRITE: begin
-				s_addr <= !s_addr_done ? s_addr_next : s_addr;
-				sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
-			end
-			*/
-		endcase
-		
-		//
-		case (fsm_next_state)
-			FSM_STATE_MULT_AB_N_COEFF_START:		ab_addr_ext <= bram_addr_ext_zero;
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	ab_addr_ext <= ab_addr_ext_next;
-		endcase
-		//
-		case (fsm_next_state)
-			FSM_STATE_MULT_Q_N_START:		q_addr <= bram_addr_zero;
-			FSM_STATE_MULT_Q_N_RELOAD:		q_addr <= !q_addr_done ? q_addr_next : q_addr;
-		endcase
-
-		//
-	end
-		
-	always @(posedge clk) begin
-		//
-		// Write ports of the AB, Q and QN memories. Each one is active only in
-		// its own *_CRUNCH state: the write enable follows
-		// shreg_done_latency_dly and the data written is pe_p[0]. Whenever no
-		// write is requested the data input is driven to X (don't care).
-		//
-		if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin
-			ab_wren <= shreg_done_latency_dly;
-			ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
-		end else begin
-			ab_wren <= 1'b0;
-			ab_data_in <= 32'hXXXXXXXX;
-		end
-		//
-		if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin
-			q_wren <= shreg_done_latency_dly;
-			q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
-		end else begin
-			q_wren <= 1'b0;
-			q_data_in <= 32'hXXXXXXXX;
-		end
-		//
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
-			qn_wren <= shreg_done_latency_dly;
-			qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
-		end else begin
-			qn_wren <= 1'b0;
-			qn_data_in <= 32'hXXXXXXXX;
-		end		
-		//
-		// Result write enable: set in SAVE_START, kept set in SAVE_WRITE until
-		// r_addr_done is seen, cleared in every other state.
-		//
-		case (fsm_state)
-			FSM_STATE_SAVE_START:	r_wren <= 1'b1;
-			FSM_STATE_SAVE_WRITE:	r_wren <= ~r_addr_done;
-			default:						r_wren <= 1'b0;
-		endcase
-		//
-	end
-	
-	
-	always @(posedge clk)
-		//
-		// Load counter, keyed on the NEXT state: cleared when the FSM is about
-		// to enter a *_START or *_RELOAD state, advanced when it is about to
-		// enter (or stay in) a *_CRUNCH state. The increment saturates once
-		// syst_cnt_load_done is set, so the counter stops at syst_cnt_last.
-		//
-		case (fsm_next_state)
-			FSM_STATE_MULT_A_B_START,
-			FSM_STATE_MULT_AB_N_COEFF_START,
-			FSM_STATE_MULT_Q_N_START,
-			FSM_STATE_MULT_A_B_RELOAD,
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
-			FSM_STATE_MULT_Q_N_RELOAD:
-				//
-				syst_cnt_load <= syst_cnt_zero;
-			
-			FSM_STATE_MULT_A_B_CRUNCH,
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-			FSM_STATE_MULT_Q_N_CRUNCH:
-				//
-				syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-				
-		endcase
-
-		
-		
-	always @(posedge clk)
-		//
-		// Unload counter, updated only in the *_CRUNCH states: cleared when
-		// shreg_done_latency is seen, otherwise advanced while
-		// shreg_now_unloading is set. The increment saturates once
-		// syst_cnt_unload_done is set; in all other cases the value is held.
-		//
-		case (fsm_state)
-			FSM_STATE_MULT_A_B_CRUNCH,
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-			FSM_STATE_MULT_Q_N_CRUNCH: begin
-		
-			if (shreg_done_latency)	syst_cnt_unload <= syst_cnt_zero;
-			else if (shreg_now_unloading)
-				syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
-
-			end
-		endcase
-	
-		
-			//
-			// T and C_IN can be moved to a separate code block
-			//
-	always @(posedge clk) begin
-		//
-		if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH)
-			//
-			for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-				//
-				if (shreg_now_loading) begin
-					pe_a[j]		<= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out;
-					pe_b[j]		<= loader_dout[j];
-					//pe_t[j]		<= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
-					//pe_c_in[j]	<= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
-				end else begin
-					pe_a[j]		<= 32'hXXXXXXXX;				
-					pe_b[j]		<= 32'hXXXXXXXX;
-					//pe_t[j]		<= 32'hXXXXXXXX;
-					//pe_c_in[j]	<= 32'hXXXXXXXX;
-				end
-		//
-		if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)
-			//
-			for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-				//
-				if (shreg_now_loading) begin
-					pe_a[j]		<= ab_data_out;
-					pe_b[j]		<= loader_dout[j];
-					//pe_t[j]		<= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
-					//pe_c_in[j]	<= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
-				end else begin
-					pe_a[j]		<= 32'hXXXXXXXX;				
-					pe_b[j]		<= 32'hXXXXXXXX;
-					//pe_t[j]		<= 32'hXXXXXXXX;
-					//pe_c_in[j]	<= 32'hXXXXXXXX;
-				end
-		//
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			//
-			for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-				//
-				if (shreg_now_loading) begin
-					pe_a[j]		<= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
-					pe_b[j]		<= loader_dout[j];
-					//pe_t[j]		<= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
-					//pe_c_in[j]	<= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
-				end else begin
-					pe_a[j]		<= 32'hXXXXXXXX;				
-					pe_b[j]		<= 32'hXXXXXXXX;
-					//pe_t[j]		<= 32'hXXXXXXXX;
-					//pe_c_in[j]	<= 32'hXXXXXXXX;
-				end
-		//
-	
-		//
-	end
-		
-		
-		//
-		// Adder
-		//
-
-	reg				add1_ce;					// clock enable
-	wire	[31: 0]	add1_s;					// sum output
-	wire				add1_c_in;				// carry input
-	reg	[31: 0]	add1_a;					// A-input
-	reg	[31: 0]	add1_b;					// B-input
-	reg				add1_c_in_mask;		// flag to not carry anything into the very first word
-	wire				add1_c_out;				// carry output
-	
-		// add masking into carry feedback chain
-	assign add1_c_in = add1_c_out & ~add1_c_in_mask;
-
-		// mask carry for the very first word of N
-	always @(posedge clk)
-		//
-		if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) 
-			add1_c_in_mask <= (ab_addr_ext == bram_addr_ext_zero) ? 1'b1 : 1'b0;
-	
-	modexpa7_adder32 add1_inst
-	(
-		.clk		(clk),
-		.ce		(add1_ce),
-		.a			(add1_a),
-		.b			(add1_b),
-		.c_in		(add1_c_in),
-		.s			(add1_s),
-		.c_out	(add1_c_out)
-	);
-	
-	always @(posedge clk)
-		//
-		add1_ce <= (fsm_next_state == FSM_STATE_MULT_Q_N_ADD_S) ? 1'b1 : 1'b0;
-		
-	always @(posedge clk)
-		//
-		if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) begin
-			add1_a <= pe_p[0];
-			add1_b <= ab_data_out;
-		end
-		
-
-		//
-		// Subtractor
-		//
-		/*
-		 * This subtractor is used to calculate SN = S - N.
-		 *
-		 */
-		 
-	reg				sub1_ce;					// clock enable
-	wire	[31: 0]	sub1_d;					// difference output
-	wire				sub1_b_in;				// borrow input
-	reg	[31: 0]	sub1_a;					// A-input
-	reg	[31: 0]	sub1_b;					// B-input
-	reg				sub1_b_in_mask;		// flag to not borrow anything from the very first word
-	wire				sub1_b_out;				// borrow output
-	
-		// add masking into borrow feedback chain
-	assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
-
-		// mask carry for the very first word of N TODO!
-	//always @(posedge clk)
-		//
-		//if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) 
-			//add1_c_in_mask <= (ab_addr_ext == bram_addr_ext_zero) ? 1'b1 : 1'b0;
-	
-	modexpa7_subtractor32 sub1_inst
-	(
-		.clk		(clk),
-		.ce		(sub1_ce),
-		.a			(sub1_a),
-		.b			(sub1_b),
-		.b_in		(sub1_b_in),
-		.d			(sub1_d),
-		.b_out	(sub1_b_out)
-	);
-	
-	always @(posedge clk)
-		//
-		sub1_ce <= (fsm_next_state == FSM_STATE_MULT_Q_N_SUB_SN) && (qn_addr_ext > {1'b0, q_addr}) ? 1'b1 : 1'b0;
-		
-	always @*
-		sub1_a = add1_s;
-		
-	always @(posedge clk)
-		//
-		//if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) begin
-			//add1_a <= pe_p[0];
-			//add1_b <= ab_data_out;
-		//end
-		 
-		 
-	/*
-	reg				sub1_ce;					// clock enable
-	reg	[31: 0]	sub1_d;					// difference output
-	wire				sub1_b_in;				// borrow input
-	wire	[31: 0]	sub1_a;					// A-input
-	reg	[31: 0]	sub1_b;					// B-input
-	reg				sub1_b_in_mask;		// flag to not borrow anything from the very first word*/
-//	wire	sub1_b_out;				// borrow output
-	/*
-	
-		// add masking into borrow feedback chain
-	assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
-	
-	always @(posedge  clk)
-		//
-		if (sub1_ce)
-			//
-			{sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};
-	
-	assign sub1_a = add1_s;
-	
-	always @(posedge clk)
-		//
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
-		else
-			sub1_b <= 32'hXXXXXXXX;
-
-	always @(posedge clk)
-		//
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
-		else
-			sub1_b_in_mask <= 1'b0;
-
-	always @(posedge clk)
-		//
-		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-			sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
-		else
-			sub1_ce <= 1'b0;
-			*/
-
-
-	assign s_data_in = add1_s;
-	assign sn_data_in = sub1_d;
-	
-	always @(posedge clk) begin
-		//
-		s_wren <= ((fsm_state == FSM_STATE_MULT_Q_N_ADD_S) && (qn_addr_ext > {1'b0, q_addr})) ? 1'b1 : 1'b0;
-		sn_wren <= ((fsm_state == FSM_STATE_MULT_Q_N_SUB_SN) && (qn_addr_ext > {1'b0, q_addr})) ? 1'b1 : 1'b0;
-		//
-	end			
-		
-
-	always @(posedge clk)
-		//
-		if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
-			flag_select_s <= sub1_b_out & ~add1_c_out;
-		
-
-	always @(posedge clk)
-		//
-		case (fsm_state)
-			FSM_STATE_SAVE_START,
-			FSM_STATE_SAVE_WRITE:
-				r_data_in <= flag_select_s ? s_data_out : sn_data_out;
-		endcase
-
-		
-			
-		//
-		// FSM Process
-		//
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0)	fsm_state <= FSM_STATE_IDLE;
-		else						fsm_state <= fsm_next_state;
-	
-	
-		//
-		// FSM Transition Logic
-		//
-	always @* begin
-		//
-		fsm_next_state = FSM_STATE_STOP;
-		//
-		case (fsm_state)
-
-			FSM_STATE_IDLE:				if (ena_trig)				fsm_next_state = FSM_STATE_LOAD_B_START;
-												else							fsm_next_state = FSM_STATE_IDLE;
-			//
-			FSM_STATE_LOAD_B_START:											fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
-			FSM_STATE_LOAD_B_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_B_WRITE;
-												else								fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
-			FSM_STATE_LOAD_B_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_B_FINAL;
-												else							fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
-			FSM_STATE_LOAD_B_FINAL:										fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
-			//
-			FSM_STATE_LOAD_N_COEFF_START:											fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
-			FSM_STATE_LOAD_N_COEFF_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
-												else								fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
-			FSM_STATE_LOAD_N_COEFF_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
-												else							fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
-			FSM_STATE_LOAD_N_COEFF_FINAL:										fsm_next_state = FSM_STATE_LOAD_N_START;
-			//
-			FSM_STATE_LOAD_N_START:											fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
-			FSM_STATE_LOAD_N_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_N_WRITE;
-												else								fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
-			FSM_STATE_LOAD_N_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_N_FINAL;
-												else							fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
-			FSM_STATE_LOAD_N_FINAL:										fsm_next_state = FSM_STATE_MULT_A_B_START;
-			//
-			FSM_STATE_MULT_A_B_START:									fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
-			FSM_STATE_MULT_A_B_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
-												else							fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
-			FSM_STATE_MULT_A_B_RELOAD:	if (ab_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
-												else							fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
-			FSM_STATE_MULT_A_B_FINAL:									fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
-			//
-			FSM_STATE_MULT_AB_N_COEFF_START:									fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
-			FSM_STATE_MULT_AB_N_COEFF_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
-															else							fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
-			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	if (q_addr_done)	fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
-															else							fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
-			FSM_STATE_MULT_AB_N_COEFF_FINAL:									fsm_next_state = FSM_STATE_MULT_Q_N_START;
-			//
-			FSM_STATE_MULT_Q_N_START:									fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
-			FSM_STATE_MULT_Q_N_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_Q_N_ADD_S;
-															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
-			FSM_STATE_MULT_Q_N_ADD_S:									fsm_next_state = FSM_STATE_MULT_Q_N_SUB_SN;
-			FSM_STATE_MULT_Q_N_SUB_SN:									fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
-			FSM_STATE_MULT_Q_N_RELOAD:	if (qn_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
-															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
-			FSM_STATE_MULT_Q_N_FINAL:									fsm_next_state = FSM_STATE_SAVE_START;
-			//
-			FSM_STATE_SAVE_START:										fsm_next_state = FSM_STATE_SAVE_WRITE;
-			FSM_STATE_SAVE_WRITE:	if (r_addr_done)				fsm_next_state = FSM_STATE_SAVE_FINAL;
-											else								fsm_next_state = FSM_STATE_SAVE_WRITE;
-			FSM_STATE_SAVE_FINAL:										fsm_next_state = FSM_STATE_STOP;
-			//
-			FSM_STATE_STOP:												fsm_next_state = FSM_STATE_IDLE;
-
-		endcase
-		//
-	end
-
-
-endmodule
-
-//======================================================================
-// End of file
-//======================================================================
diff --git a/src/rtl/modexpa7_wrapper.v b/src/rtl/modexpa7_wrapper.v
index 3b749be..090ea8d 100644
--- a/src/rtl/modexpa7_wrapper.v
+++ b/src/rtl/modexpa7_wrapper.v
@@ -35,7 +35,6 @@ module modexpa7_wrapper #
 		parameter OPERAND_ADDR_WIDTH		= 5,
 		parameter SYSTOLIC_ARRAY_POWER	= 2
 	)
-
 	(
 		input											clk,
 		input											rst_n,
@@ -62,7 +61,7 @@ module modexpa7_wrapper #
 		/*
 		 * Output Mux
 		 */
-	wire	[31: 0]	read_data_regs;
+	reg	[31: 0]	read_data_regs;
 	wire	[31: 0]	read_data_core;
 
 
@@ -75,27 +74,31 @@ module modexpa7_wrapper #
 
 	localparam	[OPERAND_ADDR_WIDTH+1:0]	ADDR_CONTROL			= 'h08;	// {next, init}
 	localparam	[OPERAND_ADDR_WIDTH+1:0]	ADDR_STATUS				= 'h09;	// {valid, ready}
-//	localparam	[OPERAND_ADDR_WIDTH+1:0]	ADDR_MODE							// NOT USED ANYMORE
+	localparam	[OPERAND_ADDR_WIDTH+1:0]	ADDR_MODE				= 'h10;	// {crt, dummy}
 	localparam	[OPERAND_ADDR_WIDTH+1:0]	ADDR_MODULUS_BITS		= 'h11;	// number of bits in modulus
 	localparam	[OPERAND_ADDR_WIDTH+1:0]	ADDR_EXPONENT_BITS	= 'h12;	// number of bits in exponent
 	localparam	[OPERAND_ADDR_WIDTH+1:0]	ADDR_BUFFER_BITS		= 'h13;	// largest supported number of bits
-	localparam	[OPERAND_ADDR_WIDTH+1:0]	ADDR_ARRAY_BITS		= 'h15;	// number of bits in systolic array
+	localparam	[OPERAND_ADDR_WIDTH+1:0]	ADDR_ARRAY_BITS		= 'h14;	// number of bits in systolic array
 
 	localparam	CONTROL_INIT_BIT	= 0;
 	localparam	CONTROL_NEXT_BIT	= 1;
 
 	localparam	STATUS_READY_BIT	= 0;
-	localparam	STATUS_VALID_BIT	= 1;
+	localparam	STATUS_VALID_BIT	= 1;
+	
+	localparam	MODE_DUMMY_BIT		= 0;
+	localparam	MODE_CRT_BIT		= 1;
 
 	localparam	CORE_NAME0			= 32'h6D6F6465;	// "mode"
 	localparam	CORE_NAME1			= 32'h78706137;	// "xpa7"
-	localparam	CORE_VERSION		= 32'h302E3230;	// "0.10"
+	localparam	CORE_VERSION		= 32'h302E3230;	// "0.20"
 
 
 		/*
 		 * Registers
 		 */
-	reg	[                   1:0]	reg_control;
+	reg	[                   1:0]	reg_control;
+	reg	[                   1:1]	reg_mode;
 	reg	[OPERAND_ADDR_WIDTH+5:0]	reg_modulus_bits;
 	reg	[OPERAND_ADDR_WIDTH+5:0]	reg_exponent_bits;
 
@@ -142,34 +145,53 @@ module modexpa7_wrapper #
 		.bus_data_wr			(write_data),
 		.bus_data_rd			(read_data_core)
 	);
-
-
-		/*
-		 * Read Latch
-		 */
-		 
-	reg	[31: 0]	read_data_regs;
 	
 	
 		/*
 		 * Write Checker
 		 */
-
-		 // largest supported operand width
-	localparam	[OPERAND_ADDR_WIDTH+5:0]	BUFFER_BITS	= {1'b1, {OPERAND_ADDR_WIDTH+4{1'b0}}};
+
+		 // smallest and largest supported exponent and modulus widths (in bits)
+	localparam	[OPERAND_ADDR_WIDTH+5:0]	EXPONENT_MIN_BITS	= {{OPERAND_ADDR_WIDTH+4{1'b0}}, 2'b10};
+	localparam	[OPERAND_ADDR_WIDTH+5:0]	EXPONENT_MAX_BITS	= {1'b1, {OPERAND_ADDR_WIDTH+5{1'b0}}};
+	
+	localparam	[OPERAND_ADDR_WIDTH+5:0]	MODULUS_MIN_BITS	= {{OPERAND_ADDR_WIDTH-1{1'b0}}, 7'b1000000};
+	localparam	[OPERAND_ADDR_WIDTH+5:0]	MODULUS_MAX_BITS	= {1'b1, {OPERAND_ADDR_WIDTH+5{1'b0}}};
 		 
-		 // check_modulus_bits
+		 //
+		 // Limits on modulus_bits:
+		 //
+		 // Must be 64 .. BUFFER_BITS in steps of 32
+		 //
 	function	[OPERAND_ADDR_WIDTH+5:0]	check_modulus_bits;
 		input	[OPERAND_ADDR_WIDTH+5:0]	num_bits;
 		begin
-			//
-			//t = num_bits[]
-			//if (num_bits > MAX_BITS)	write_check_bits = MAX_BITS;
-			//else								write_check_bits = num_bits;
-			//
+			
+				// store input value
+			check_modulus_bits = num_bits;
+			
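+				// round up to a multiple of 32 (NOTE(review): the +32 wraps to 0 when all upper bits of num_bits are set, so such inputs end up clamped to MIN rather than MAX)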
+			check_modulus_bits[4:0] = {5{1'b0}};
+			if (check_modulus_bits < num_bits)
+				check_modulus_bits = check_modulus_bits + 6'd32;
+				
+				// too large?
+			if (check_modulus_bits > MODULUS_MAX_BITS)
+				check_modulus_bits = MODULUS_MAX_BITS;
+			
+				// too small?
+			if (check_modulus_bits < MODULUS_MIN_BITS)
+				check_modulus_bits = MODULUS_MIN_BITS;
+				
 		end
 	endfunction
 
+		//
+		// Limits on exponent_bits:
+		//
+		// Must be 2 .. BUFFER_BITS;
+		//
+		//
 	function	[OPERAND_ADDR_WIDTH+5:0]	check_exponent_bits;
 		input	[OPERAND_ADDR_WIDTH+5:0]	num_bits;
 		begin
@@ -178,12 +200,12 @@ module modexpa7_wrapper #
 			check_exponent_bits = num_bits;
 			
 				// too large?
-			if (num_bits > BUFFER_BITS)
-				check_exponent_bits = BUFFER_BITS;
+			if (check_exponent_bits > EXPONENT_MAX_BITS)
+				check_exponent_bits = EXPONENT_MAX_BITS;
 			
 				// too small?
-			if (num_bits == {OPERAND_ADDR_WIDTH+5{1'b0}})
-				num_bits = {{OPERAND_ADDR_WIDTH+4{1'b0}}, 1'b1};
+			if (check_exponent_bits < EXPONENT_MIN_BITS)
+				check_exponent_bits = EXPONENT_MIN_BITS;
 				
 			//
 		end
@@ -194,9 +216,24 @@ module modexpa7_wrapper #
 		 * Internal Quantities Generator
 		 */
 
-	function	[OPERAND_ADDR_WIDTH-1:0]	modulus_num_words_core;
-		input	[OPERAND_ADDR_WIDTH+5:0]	num_bits;
+
+	function	[OPERAND_ADDR_WIDTH-1:0]	get_modulus_num_words_core;
+		input	[OPERAND_ADDR_WIDTH+5:0]	num_bits;
+		reg	[OPERAND_ADDR_WIDTH+5:0]	num_words_checked;
 		begin
+
+				// check number of bits
+			num_words_checked = check_modulus_bits(num_bits);
+			
+				// convert number of bits to number of 32-bit words (drop the 5 LSBs)
+			num_words_checked = {{5{1'b0}}, num_words_checked[OPERAND_ADDR_WIDTH+5:5]};
+			
+				// reduce by 1
+			num_words_checked = num_words_checked - 1'b1;
+			
+				// return
+			get_modulus_num_words_core = num_words_checked[OPERAND_ADDR_WIDTH-1:0];
+
 		end
 	endfunction
 
@@ -205,14 +242,19 @@ module modexpa7_wrapper #
 		reg	[OPERAND_ADDR_WIDTH+5:0]	num_bits_checked;
 		begin
 			
-				// check number of bits (not too large, not too small)
+				// check number of bits
 			num_bits_checked = check_exponent_bits(num_bits);
 			
-				// de
+				// reduce by 1
+			num_bits_checked = num_bits_checked - 1'b1;
+			
+				// return
+			get_exponent_num_bits_core = num_bits_checked[OPERAND_ADDR_WIDTH+4:0];
+			
 		end
 	endfunction
 												
-
+
 		/*
 		 * Write Interface (External Registers)
 		 */
@@ -229,7 +271,8 @@ module modexpa7_wrapper #
 			//
 			case (address_lsb)
 				//
-				ADDR_CONTROL:			reg_control				<= write_data[ 1: 0];
+				ADDR_CONTROL:			reg_control				<= write_data[ 1: 0];
+				ADDR_MODE:				reg_mode					<= write_data[MODE_CRT_BIT];
 				ADDR_MODULUS_BITS:	reg_modulus_bits		<= check_modulus_bits(write_data[OPERAND_ADDR_WIDTH+5:0]);
 				ADDR_EXPONENT_BITS:	reg_exponent_bits		<= check_exponent_bits(write_data[OPERAND_ADDR_WIDTH+5:0]);
 				//
@@ -265,17 +308,20 @@ module modexpa7_wrapper #
 			//
 			case (address_lsb)
 				//
-				ADDR_NAME0:				tmp_read_data <= CORE_NAME0;
-				ADDR_NAME1:				tmp_read_data <= CORE_NAME1;
-				ADDR_VERSION:			tmp_read_data <= CORE_VERSION;
+				ADDR_NAME0:				read_data_regs <= CORE_NAME0;
+				ADDR_NAME1:				read_data_regs <= CORE_NAME1;
+				ADDR_VERSION:			read_data_regs <= CORE_VERSION;
 				
-				ADDR_CONTROL:			tmp_read_data <= {{30{1'b0}}, reg_control};
-				ADDR_STATUS:			tmp_read_data <= {{30{1'b0}}, reg_status};
+				ADDR_CONTROL:			read_data_regs <= {{30{1'b0}}, reg_control};
+				ADDR_MODE:				read_data_regs <= {{30{1'b0}}, reg_mode, 1'b0};
+				ADDR_STATUS:			read_data_regs <= {{30{1'b0}}, reg_status};
 				
-				ADDR_MODULUS_BITS:	tmp_read_data <= {{19{1'b0}}, reg_modulus_bits};
-				ADDR_EXPONENT_BITS:	tmp_read_data <= {{19{1'b0}}, reg_exponent_bits};
+				ADDR_MODULUS_BITS:	read_data_regs <= {{19{1'b0}}, reg_modulus_bits};
+				ADDR_EXPONENT_BITS:	read_data_regs <= {{19{1'b0}}, reg_exponent_bits};
+				ADDR_BUFFER_BITS:		read_data_regs <= {{26-OPERAND_ADDR_WIDTH  {1'b0}}, 1'b1, {  OPERAND_ADDR_WIDTH+5{1'b0}}};
+				ADDR_ARRAY_BITS:		read_data_regs <= {{26-SYSTOLIC_ARRAY_POWER{1'b0}}, 1'b1, {SYSTOLIC_ARRAY_POWER+5{1'b0}}};
 				//
-				default:					tmp_read_data <= {32{1'b0}};
+				default:					read_data_regs <= {32{1'b0}};
 				//
 			endcase
 
@@ -294,7 +340,7 @@ module modexpa7_wrapper #
 
 	always @(*)
 		//
-		case (address_msb_last)
+		case (address_msb_dly)
 			ADDR_MSB_REGS:		read_data_mux = read_data_regs;
 			ADDR_MSB_CORE:		read_data_mux = read_data_core;
 		endcase
diff --git a/src/tb/tb_exponentiator.v b/src/tb/tb_exponentiator.v
index c9a9f7e..16be0a5 100644
--- a/src/tb/tb_exponentiator.v
+++ b/src/tb/tb_exponentiator.v
@@ -160,7 +160,7 @@ module tb_exponentiator;
 	modexpa7_exponentiator #
 	(
 		.OPERAND_ADDR_WIDTH		(4),	// 32 * (2**4) = 512-bit operands
-		.SYSTOLIC_ARRAY_POWER	(2)	// 2 ** 2 = 4-tap systolic array
+		.SYSTOLIC_ARRAY_POWER	(3)	// 2 ** 3 = 8-tap systolic array
 	)
 	uut
 	(
@@ -207,7 +207,7 @@ module tb_exponentiator;
 		rst_n = 1'b1;
 		#100;
 		
-		//test_exponent_384(M_384, D_384, FACTOR_384, N_384, N_COEFF_384, S_384);
+		test_exponent_384(M_384, D_384, FACTOR_384, N_384, N_COEFF_384, S_384);
 		test_exponent_512(M_512, D_512, FACTOR_512, N_512, N_COEFF_512, S_512);
 		
 	end
diff --git a/src/tb/tb_systolic_multiplier.v b/src/tb/tb_systolic_multiplier.v
index e9d532e..96e76d5 100644
--- a/src/tb/tb_systolic_multiplier.v
+++ b/src/tb/tb_systolic_multiplier.v
@@ -57,7 +57,7 @@ module tb_systolic_multiplier;
 		//
 		// Model Settings
 		//
-	localparam NUM_ROUNDS = 43;
+	localparam NUM_ROUNDS = 1000;
 	
 	
 		//
@@ -193,7 +193,7 @@ module tb_systolic_multiplier;
 		#100;
 		
 		test_systolic_multiplier_384(M_384, N_384, N_COEFF_384, FACTOR_384, COEFF_384);
-		//test_systolic_multiplier_512(M_512, N_512, N_COEFF_512, FACTOR_512, COEFF_512);
+		test_systolic_multiplier_512(M_512, N_512, N_COEFF_512, FACTOR_512, COEFF_512);
 		
 	end
       
diff --git a/src/tb/tb_wrapper.v b/src/tb/tb_wrapper.v
index bd8dbf1..fae0934 100644
--- a/src/tb/tb_wrapper.v
+++ b/src/tb/tb_wrapper.v
@@ -2,43 +2,108 @@
 
 module tb_wrapper;
 
-	// Inputs
+		/*
+		 * Settings
+		 */
+	localparam	USE_OPERAND_ADDR_WIDTH		= 7;
+	localparam	USE_SYSTOLIC_ARRAY_POWER	= 1;
+
+		/*
+		 * Clock (100 MHz)
+		 */
 	reg clk;
+	initial clk = 1'b0;
+	always #5 clk = ~clk;
+	
+		/*
+		 * Reset
+		 */
 	reg rst_n;
-	reg cs;
-	reg we;
-	reg [7:0] address;
-	reg [31:0] write_data;
-
-	// Outputs
-	wire [31:0] read_data;
+		 
+		 /*
+		  * Access Bus
+		  */
+	reg											bus_cs;
+	reg											bus_we;
+	reg	[USE_OPERAND_ADDR_WIDTH+2:0]	bus_addr;
+	reg	[                    32-1:0]	bus_wr_data;
+	wire	[                    32-1:0]	bus_rd_data;
 
-	// Instantiate the Unit Under Test (UUT)
-	modexpa7_wrapper uut (
-		.clk(clk), 
-		.rst_n(rst_n), 
-		.cs(cs), 
-		.we(we), 
-		.address(address), 
-		.write_data(write_data), 
-		.read_data(read_data)
+	modexpa7_wrapper #
+	(
+		.OPERAND_ADDR_WIDTH		(USE_OPERAND_ADDR_WIDTH),
+		.SYSTOLIC_ARRAY_POWER	(USE_SYSTOLIC_ARRAY_POWER)
+	)
+	uut
+	(
+		.clk			(clk),
+		
+		.rst_n		(rst_n),
+		
+		.cs			(bus_cs), 
+		.we			(bus_we), 
+		.address		(bus_addr), 
+		.write_data	(bus_wr_data), 
+		.read_data	(bus_rd_data)
 	);
 
+	reg	[31: 0]	tmp;
 	initial begin
-		// Initialize Inputs
-		clk = 0;
+		//
 		rst_n = 0;
-		cs = 0;
-		we = 0;
-		address = 0;
-		write_data = 0;
-
-		// Wait 100 ns for global reset to finish
-		#100;
-        
-		// Add stimulus here
-
+		//
+		bus_cs		= 0;
+		bus_we		= 0;
+		bus_addr		= 'bX;
+		bus_wr_data	= 'bX;
+		//
+		#200;
+		//
+		rst_n = 1;
+		//
+		read_reg('h00, tmp);			// NAME0
+		read_reg('h01, tmp);			// NAME1
+		read_reg('h02, tmp);			// VERSION
+		//
+		read_reg('h13, tmp);			// BUFFER_BITS
+		read_reg('h14, tmp);			// ARRAY_BITS
+		//
+		write_reg('h12, 32'd384);	// EXPONENT_BITS
+		read_reg ('h12, tmp);
+		//
+		write_reg('h11, 32'd384);	// MODULUS_BITS
+		read_reg ('h11, tmp);
+		//
+		//
 	end
+	
+	task read_reg;
+		input		[USE_OPERAND_ADDR_WIDTH+1:0]	addr;
+		output	[                    32-1:0]	data;
+		begin
+			bus_cs = 1;
+			bus_addr = {1'b0, addr};
+			#10;
+			bus_cs = 0;
+			bus_addr = 'bX;
+			data = bus_rd_data;
+		end
+	endtask
+
+	task write_reg;
+		input		[USE_OPERAND_ADDR_WIDTH+1:0]	addr;
+		input		[                    32-1:0]	data;
+		begin
+			bus_cs = 1;
+			bus_we = 1;
+			bus_addr = {1'b0, addr};
+			bus_wr_data = data;
+			#10;
+			bus_cs = 0;
+			bus_we = 0;
+			bus_addr = 'bX;
+		end
+	endtask
       
 endmodule
 

-- 
To stop receiving notification emails like this one, please contact
the administrator of this repository.


More information about the Commits mailing list