[Cryptech-Commits] [core/math/modexpa7] branch systolic updated: Work in progress.

git at cryptech.is git at cryptech.is
Thu Jul 27 07:44:41 UTC 2017


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch systolic
in repository core/math/modexpa7.

The following commit(s) were added to refs/heads/systolic by this push:
     new 9f77c4f  Work in progress.
9f77c4f is described below

commit 9f77c4f559daf20e8b495e26003178c57da93fe2
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Thu Jul 27 10:43:49 2017 +0300

    Work in progress.
---
 src/rtl/modexpa7_systolic_multiplier.v       |    8 +-
 src/rtl/modexpa7_systolic_multiplier_array.v |  335 +++----
 src/rtl/modexpa7_systolic_multiplier_fix.v   | 1202 ++++++++++++++++++++++++++
 3 files changed, 1391 insertions(+), 154 deletions(-)

diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
index 9d96f98..32ed543 100644
--- a/src/rtl/modexpa7_systolic_multiplier.v
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -292,7 +292,8 @@ module modexpa7_systolic_multiplier #
 	bram_p
 	(	.clk(clk),
 		.a_addr(p_addr_ext_wr), .a_wr(p_wren), .a_in(p_data_in), .a_out(),
-		.b_addr(p_addr_ext_rd), .b_out(p_data_out));
+		.b_addr(p_addr_ext_rd), .b_out(p_data_out)
+	);
 
 				
 		/*
@@ -397,13 +398,14 @@ module modexpa7_systolic_multiplier #
 
 		.loader_addr_rd	(loader_addr_rd),
 		
-		.pe_a_wide			(),
+		.pe_a_wide			({SYSTOLIC_ARRAY_LENGTH{a_bram_out}}),
 		.pe_b_wide			(pe_b_wide),
 		
+		.a_bram_addr		(a_bram_addr),
+		
 		.p_bram_addr		(p_addr_ext_wr),
 		.p_bram_in			(p_data_in),
 		.p_bram_wr			(p_wren),
-
 
 		.n_num_words		(n_num_words_latch),
 		.p_num_words		(p_num_words_latch)
diff --git a/src/rtl/modexpa7_systolic_multiplier_array.v b/src/rtl/modexpa7_systolic_multiplier_array.v
index 029d9d6..22d5aaf 100644
--- a/src/rtl/modexpa7_systolic_multiplier_array.v
+++ b/src/rtl/modexpa7_systolic_multiplier_array.v
@@ -42,23 +42,25 @@ module modexpa7_systolic_multiplier_array #
 		parameter	SYSTOLIC_ARRAY_POWER		= 2
 	)
 	(
-		input																	clk,
-		input																	rst_n,
+		input																				clk,
+		input																				rst_n,
 
-		input																	ena,
-		output																rdy,
+		input																				ena,
+		output																			rdy,
 
 		output	[OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER - 1 : 0]	loader_addr_rd,
 
-		input		[32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0]	pe_a_wide,
-		input		[32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0]	pe_b_wide,
+		input		[         32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0]	pe_a_wide,
+		input		[         32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0]	pe_b_wide,
+
+		output	[                       OPERAND_ADDR_WIDTH - 1 : 0]	a_bram_addr,
 		
-		output	[              OPERAND_ADDR_WIDTH     : 0]	p_bram_addr,
-		output	[                              32 - 1 : 0]	p_bram_in,
-		output																p_bram_wr,
+		output	[                       OPERAND_ADDR_WIDTH     : 0]	p_bram_addr,
+		output	[                                       32 - 1 : 0]	p_bram_in,
+		output																			p_bram_wr,
 
-		input		[              OPERAND_ADDR_WIDTH - 1 : 0]	n_num_words,
-		input		[              OPERAND_ADDR_WIDTH     : 0]	p_num_words
+		input		[                       OPERAND_ADDR_WIDTH - 1 : 0]	n_num_words,
+		input		[                       OPERAND_ADDR_WIDTH     : 0]	p_num_words
 	);
 	
 		
@@ -75,7 +77,7 @@ module modexpa7_systolic_multiplier_array #
 	localparam	[ 7: 0]	FSM_STATE_IDLE				= 8'h00;
 	
 	localparam	[ 7: 0]	FSM_STATE_MULT_START		= 8'h11;
-	localparam	[ 7: 0]	FSM_STATE_MULT_CRUNCH	= 8'h12;
+	localparam	[ 7: 0]	FSM_STATE_MULT_CRUNCH	= 8'h12;
 	localparam	[ 7: 0]	FSM_STATE_MULT_RELOAD	= 8'h13;
 	localparam	[ 7: 0]	FSM_STATE_MULT_FINAL		= 8'h14;
 	
@@ -138,6 +140,107 @@ module modexpa7_systolic_multiplier_array #
 			
 			
 		/*
+		 * Systolic Cycle Counters
+		 */
+		
+		// handy values 
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
+	
+		// counters
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load;
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload;
+		
+		// handy increment values
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_next		= syst_cnt_load   + 1'b1;
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload_next		= syst_cnt_unload + 1'b1;
+
+		// handy stop flags
+	wire										syst_cnt_load_done		= (syst_cnt_load   == syst_cnt_last) ? 1'b1 : 1'b0;
+	wire										syst_cnt_unload_done		= (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
+
+	always @(posedge clk)
+		// load counter: driven by the upcoming state (fsm_next_state); holds its value in states not listed
+		case (fsm_next_state)
+			FSM_STATE_MULT_START,
+			FSM_STATE_MULT_RELOAD:
+				// entering a new pass: restart from zero
+				syst_cnt_load <= syst_cnt_zero;
+			
+			FSM_STATE_MULT_CRUNCH:
+				// count up and saturate at syst_cnt_last (syst_cnt_load_done)
+				syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+				
+		endcase
+		
+	always @(posedge clk)
+		// unload counter: only updated while the FSM is in FSM_STATE_MULT_CRUNCH
+		if (fsm_state == FSM_STATE_MULT_CRUNCH) begin
+			// shreg_done_latency has priority: it restarts the counter, otherwise count while unloading
+			if (shreg_done_latency)
+				syst_cnt_unload <= syst_cnt_zero;
+			else if (shreg_now_unloading)
+				syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
+			// counter saturates at syst_cnt_last (syst_cnt_unload_done)
+		end
+
+			
+		/*
+		 * Timing Shift Registers
+		 */
+		 
+	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_load;
+	reg	[SYSTOLIC_PE_LATENCY  :0]	shreg_latency;
+	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_unload;
+
+	wire	shreg_done_load		= shreg_load[syst_cnt_last];
+	wire	shreg_done_latency	= shreg_latency[SYSTOLIC_PE_LATENCY];
+	wire	shreg_done_unload		= shreg_unload[syst_cnt_last];
+
+	reg	shreg_now_loading;
+	reg	shreg_now_latency;
+	reg	shreg_now_unloading;
+	
+	always @(posedge clk)
+		// timing chains: a single 1 is seeded into shreg_load / shreg_latency and shifted left every CRUNCH cycle
+		case (fsm_state)
+			//
+			FSM_STATE_MULT_START,
+			FSM_STATE_MULT_RELOAD: begin
+				// start of a pass: loading and latency phases are active, unloading is not
+				shreg_now_loading		<= 1'b1;
+				shreg_now_latency		<= 1'b1;
+				shreg_now_unloading	<= 1'b0;
+				// seed bit 0 of the load and latency chains, clear the unload chain
+				shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
+				shreg_latency	<= {{SYSTOLIC_PE_LATENCY  {1'b0}}, 1'b1};
+				shreg_unload	<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+				//
+			end
+			//
+			FSM_STATE_MULT_CRUNCH: begin
+				// shift all chains left by one; the top bit of shreg_latency is fed into bit 0 of shreg_unload
+				shreg_load		<= {shreg_load   [SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
+				shreg_latency	<= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
+				shreg_unload	<= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
+				// phase flags are switched by the shreg_done_* taps; latency done both ends latency and starts unloading
+				if (shreg_done_load)				shreg_now_loading <= 1'b0;
+				if (shreg_done_latency)			shreg_now_latency <= 1'b0;
+				if (shreg_done_latency)			shreg_now_unloading <= 1'b1;
+				else if (shreg_done_unload)	shreg_now_unloading <= 1'b0;
+				
+			end
+			//
+			default: begin
+				shreg_now_loading		<= 1'b0;
+				shreg_now_latency		<= 1'b0;
+				shreg_now_unloading	<= 1'b0;
+			end
+			// note: the default branch clears only the phase flags, the shift registers keep their contents
+		endcase
+			
+			
+		/*
 		 * Systolic Array of Processing Elements
 		 */
 	reg	[31: 0]	pe_a        [0:SYSTOLIC_ARRAY_LENGTH-1];
@@ -215,195 +318,125 @@ module modexpa7_systolic_multiplier_array #
 			assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
 			assign pe_t[i]    = fifo_t_dout[32 * (i + 1) - 1 -: 32];
 			//
-			//assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];
-			//
-			//always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];
-			//
 		end
 		//
 	endgenerate
 
-		
+
+		/*
+		 * FIFO Reset Logic
+		 */
+	always @(posedge clk)
+		// fifo_c_rst is asserted in MULT_START and released in MULT_CRUNCH once shreg_done_load is seen; held otherwise
+		case (fsm_state)
+			FSM_STATE_MULT_START:									fifo_c_rst <= 1'b1;
+			FSM_STATE_MULT_CRUNCH:	if (shreg_done_load)		fifo_c_rst <= 1'b0;
+		endcase
+
+	always @(posedge clk)
+		// fifo_t_rst is asserted in MULT_START and released in MULT_CRUNCH once shreg_done_load is seen; held otherwise
+		case (fsm_state)
+			FSM_STATE_MULT_START:									fifo_t_rst <= 1'b1;
+			FSM_STATE_MULT_CRUNCH:	if (shreg_done_load)		fifo_t_rst <= 1'b0;
+		endcase
+
+
 		/*
 		 * Block Memory Interface
 		 */
 
 		// the very first address
-	wire	[OPERAND_ADDR_WIDTH:0]	bram_addr_zero = {OPERAND_ADDR_WIDTH+1{1'b0}};
+	wire	[OPERAND_ADDR_WIDTH - 1 : 0]	bram_addr_zero     = {OPERAND_ADDR_WIDTH  {1'b0}};
+	wire	[OPERAND_ADDR_WIDTH     : 0]	bram_addr_ext_zero = {OPERAND_ADDR_WIDTH+1{1'b0}};
 	
 		// the very last address
-	wire	[OPERAND_ADDR_WIDTH:0]	bram_addr_last = p_num_words_latch;
+	wire	[OPERAND_ADDR_WIDTH - 1 : 0]	bram_addr_last     = n_num_words_latch;
+	wire	[OPERAND_ADDR_WIDTH     : 0]	bram_addr_ext_last = p_num_words_latch;
 		
 		// registers
-	reg	[OPERAND_ADDR_WIDTH:0]	p_addr;
-	reg	[                31:0]	p_data_in;
-	reg									p_wren;
+	reg	[OPERAND_ADDR_WIDTH - 1 : 0]	a_addr;
+	reg	[OPERAND_ADDR_WIDTH     : 0]	p_addr;
+	reg	[                32 - 1 : 0]	p_data_in;
+	reg											p_wren;
 
 		// handy values 
-	wire	[OPERAND_ADDR_WIDTH:0]	p_addr_next = p_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH - 1 : 0]	a_addr_next = a_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH     : 0]	p_addr_next = p_addr + 1'b1;
 	
 		// handy flags
-	wire	p_addr_done =  (p_addr == bram_addr_last) ? 1'b1 : 1'b0;
-
+	wire	a_addr_done = (a_addr == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	p_addr_done = (p_addr == bram_addr_ext_last) ? 1'b1 : 1'b0;
 	
 		// map top-level ports to internal registers
+	assign a_bram_addr	= a_addr;
 	assign p_bram_addr	= p_addr;
 	assign p_bram_in		= p_data_in;
 	assign p_bram_wr		= p_wren;
 
-
-		/*
-		 * Systolic Cycle Counters
-		 */
-		
-		// handy values 
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
-	
-		// counters
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load;
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload;
-		
-		// handy increment values
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_next		= syst_cnt_load   + 1'b1;
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload_next		= syst_cnt_unload + 1'b1;
-
-		// handy stop flags
-	wire										syst_cnt_load_done		= (syst_cnt_load   == syst_cnt_last) ? 1'b1 : 1'b0;
-	wire										syst_cnt_unload_done		= (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
-
-	always @(posedge clk)
-		//
-		case (fsm_next_state)
-			FSM_STATE_MULT_START,
-			FSM_STATE_MULT_RELOAD:
-				//
-				syst_cnt_load <= syst_cnt_zero;
-			
-			FSM_STATE_MULT_CRUNCH,
-				//
-				syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-				
-		endcase
-		
-	always @(posedge clk)
-		//
-		if (fsm_state == FSM_STATE_MULT_CRUNCH) begin
-			//	
-			if (shreg_done_latency)
-				syst_cnt_unload <= syst_cnt_zero;
-			else if (shreg_now_unloading)
-				syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
-			//
-		end
-
-
-
-		/*
-		 * Shift Registers
-		 */
-	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_load;
-	reg	[SYSTOLIC_PE_LATENCY  :0]	shreg_latency;
-	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_unload;
-
-	wire	shreg_done_load		= shreg_load[syst_cnt_last];
-	wire	shreg_done_latency	= shreg_latency[SYSTOLIC_PE_LATENCY];
-	wire	shreg_done_unload		= shreg_unload[syst_cnt_last];
-
-	reg	shreg_now_loading;
-	reg	shreg_now_latency;
-	reg	shreg_now_unloading;
-	
+	integer j;
 	always @(posedge clk)
 		//
-		case (fsm_state)
+		if (fsm_state == FSM_STATE_MULT_CRUNCH)
 			//
-			//FSM_STATE_IDLE: begin
-				//shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
-				//shreg_latency	<= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
-				//shreg_unload	<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
-			//end
-			//
-			FSM_STATE_MULT_START,
-			FSM_STATE_MULT_RELOAD: begin
+			for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
 				//
-				shreg_now_loading		<= 1'b1;
-				shreg_now_latency		<= 1'b1;
-				shreg_now_unloading	<= 1'b0;
-				//
-				shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
-				shreg_latency	<= {{SYSTOLIC_PE_LATENCY  {1'b0}}, 1'b1};
-				shreg_unload	<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
-				//
-			end
-			//
-			FSM_STATE_MULT_CRUNCH: begin
-				//
-				shreg_load		<= {shreg_load   [SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
-				shreg_latency	<= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
-				shreg_unload	<= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
-				//
-				if (shreg_done_load)				shreg_now_loading <= 1'b0;
-				if (shreg_done_latency)			shreg_now_latency <= 1'b0;
-				if (shreg_done_latency)			shreg_now_unloading <= 1'b1;
-				else if (shreg_done_unload)	shreg_now_unloading <= 1'b0;
-				
-			end
-			//
-			default: begin
-				shreg_now_loading		<= 1'b0;
-				shreg_now_latency		<= 1'b0;
-				shreg_now_unloading	<= 1'b0;
-			end
-			//
-		endcase
+				if (shreg_now_loading) begin
+					pe_a[j]		<= (p_addr > {1'b0, a_addr}) ? 32'd0 : pe_a_wide[32 * (j + 1) - 1 -: 32];
+					pe_b[j]		<= pe_b_wide[32 * (j + 1) - 1 -: 32];
+				end else begin
+					pe_a[j]		<= 32'hXXXXXXXX;				
+					pe_b[j]		<= 32'hXXXXXXXX;
+				end
 
 
 
+//		/*
+//		 *
+//		 */
+//	always @(posedge clk)
+//		//
+//		case (fsm_next_state)
+//			FSM_STATE_MULT_RELOAD:	p_wren <= 1'b1;
+//			default:						p_wren <= 1'b0;
+//		endcase
+//
 		/*
-		 *
-		 */
-	always @(posedge clk)
-		//
-		case (fsm_next_state)
-			FSM_STATE_MULT_RELOAD:	p_wren <= 1'b1;
-			default:						p_wren <= 1'b0;
-		endcase
-
-		/*
-		 *
+		 * Block Memory Address Control
 		 */
-	always @(posedge clk)
+	always @(posedge clk) begin
 		//
 		case (fsm_state)
-			FSM_STATE_MULT_START:	p_addr <= bram_addr_zero;
+			FSM_STATE_MULT_START:	p_addr <= bram_addr_ext_zero;	// p_addr is OPERAND_ADDR_WIDTH+1 bits wide, use the matching constant
 			FSM_STATE_MULT_RELOAD:	p_addr <= p_addr_next;
+		endcase
+		// a_addr (drives a_bram_addr): cleared entering MULT_START, advanced entering MULT_RELOAD, saturating at bram_addr_last
+		case (fsm_next_state)
+			FSM_STATE_MULT_START:	a_addr <= bram_addr_zero;
+			FSM_STATE_MULT_RELOAD:	a_addr <= !a_addr_done ? a_addr_next : a_addr;
 		endcase
+		// both address registers hold their value in all states not listed above
+	end
 
 		
 		/*
-		 * Loader Control
+		 * Loader Address Control
 		 */
 	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr;
 
 	assign loader_addr_rd = loader_addr;
 
-	integer j;
 	always @(posedge clk)
 		//
 		case (fsm_next_state)
-		
-			FSM_STATE_MULT_START,
+			//
+			FSM_STATE_MULT_START,
 			FSM_STATE_MULT_RELOAD:
-				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr <= syst_cnt_zero;
-													
+				loader_addr <= syst_cnt_zero;
+			//									
 			FSM_STATE_MULT_CRUNCH:
 				//
-				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-					loader_addr <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-					
+				loader_addr <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+			//	
 		endcase
 
 
@@ -433,7 +466,7 @@ module modexpa7_systolic_multiplier_array #
 			//
 			FSM_STATE_MULT_START:									fsm_next_state = FSM_STATE_MULT_CRUNCH;
 			FSM_STATE_MULT_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_RELOAD;
-											else							fsm_next_state = FSM_STATE_MULT_CRUNCH;
+											else							fsm_next_state = FSM_STATE_MULT_CRUNCH;
 			FSM_STATE_MULT_RELOAD:	if (p_addr_done)			fsm_next_state = FSM_STATE_MULT_FINAL;
 											else							fsm_next_state = FSM_STATE_MULT_CRUNCH;
 			FSM_STATE_MULT_FINAL:									fsm_next_state = FSM_STATE_STOP;
diff --git a/src/rtl/modexpa7_systolic_multiplier_fix.v b/src/rtl/modexpa7_systolic_multiplier_fix.v
new file mode 100644
index 0000000..40b2144
--- /dev/null
+++ b/src/rtl/modexpa7_systolic_multiplier_fix.v
@@ -0,0 +1,1202 @@
+//======================================================================
+//
+// modexpa7_systolic_multiplier_fix.v
+// -----------------------------------------------------------------------------
+// Systolic Montgomery multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+module modexpa7_systolic_multiplier #
+	(
+			//
+			// This sets the address widths of memory buffers. Internal data
+			// width is 32 bits, so for e.g. 2048-bit operands buffers must store
+			// 2048 / 32 = 64 words, and these need 6-bit address bus, because
+			// 2 ** 6 = 64.
+			//
+		parameter	OPERAND_ADDR_WIDTH		= 4,
+		
+			//
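+			// Sizes the systolic array: the low SYSTOLIC_ARRAY_POWER bits of the word count are dropped to get the cycle count (presumably array length = 2 ** SYSTOLIC_ARRAY_POWER, see modexpa7_settings.v).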
+			//
+		parameter	SYSTOLIC_ARRAY_POWER		= 1
+	)
+	(
+		input											clk,
+		input											rst_n,
+
+		input											ena,
+		output										rdy,
+
+		output	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr,
+		output	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr,
+		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,
+		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,
+		output	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr,
+
+		input		[                32-1:0]	a_bram_out,
+		input		[                32-1:0]	b_bram_out,
+		input		[                32-1:0]	n_bram_out,
+		input		[                32-1:0]	n_coeff_bram_out,
+
+		output	[                32-1:0]	r_bram_in,
+		output										r_bram_wr,
+
+		input		[OPERAND_ADDR_WIDTH-1:0]	ab_num_words
+	);
+	
+		
+		//
+		// Include Settings
+		//
+	`include "pe/modexpa7_primitive_switch.v"
+	`include "modexpa7_settings.v"
+		
+
+		//
+		// FSM Declaration
+		//
+	localparam	[ 7: 0]	FSM_STATE_IDLE								= 8'h00;
+
+	localparam	[ 7: 0]	FSM_STATE_LOAD_B_START					= 8'h11;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_B_SHIFT					= 8'h12;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_B_WRITE					= 8'h13;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_B_FINAL					= 8'h14;
+
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_START			= 8'h21;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_SHIFT			= 8'h22;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_WRITE			= 8'h23;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_FINAL			= 8'h24;
+
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_START					= 8'h31;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_SHIFT					= 8'h32;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_WRITE					= 8'h33;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_FINAL					= 8'h34;
+
+	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_START				= 8'h41;
+	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_CRUNCH				= 8'h42;
+	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_RELOAD				= 8'h43;
+	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_FINAL				= 8'h44;
+
+	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_START		= 8'h51;
+	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_CRUNCH		= 8'h52;
+	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_RELOAD		= 8'h53;
+	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_FINAL		= 8'h54;
+
+	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_START				= 8'h61;
+	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_CRUNCH				= 8'h62;
+	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_RELOAD				= 8'h63;
+	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_FINAL				= 8'h64;
+	
+	localparam	[ 7: 0]	FSM_STATE_SAVE_START						= 8'h71;
+	localparam	[ 7: 0]	FSM_STATE_SAVE_WRITE						= 8'h72;
+	localparam	[ 7: 0]	FSM_STATE_SAVE_FINAL						= 8'h73;	
+	
+	localparam	[ 7: 0]	FSM_STATE_STOP								= 8'hFF;
+	
+		//
+		// FSM State / Next State
+		//
+	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
+	reg	[ 7: 0]	fsm_next_state;
+
+
+		//
+		// Enable Delay and Trigger
+		//
+   reg ena_dly = 1'b0;
+	
+		/* delay enable by one clock cycle */
+   always @(posedge clk) ena_dly <= ena;
+
+		/* trigger new operation when enable goes high */
+   wire ena_trig = ena && !ena_dly;
+	
+	
+		//
+		// Ready Flag Logic
+		//
+	reg rdy_reg = 1'b1;
+	assign rdy = rdy_reg;
+
+   always @(posedge clk or negedge rst_n)
+		//
+		if (!rst_n)
+				/* asynchronous active-low reset: report ready */
+			rdy_reg <= 1'b1;
+		else
+			case (fsm_state)
+					/* idle: drop the flag in the cycle a new operation is triggered */
+				FSM_STATE_IDLE:	rdy_reg <= !ena_trig;
+					/* stop: operation is finished, raise the flag again */
+				FSM_STATE_STOP:	rdy_reg <= 1'b1;
+			endcase
+		//
+		
+		
+		//
+		// Parameters Latch
+		//
+	reg	[OPERAND_ADDR_WIDTH-1:0]	ab_num_words_latch;
+
+		/* save number of words in a and b when new operation starts */
+	always @(posedge clk)
+		// sample ab_num_words on the clock edge where the next state is FSM_STATE_LOAD_B_START; held otherwise
+		if (fsm_next_state == FSM_STATE_LOAD_B_START)
+			ab_num_words_latch <= ab_num_words;
+			
+			
+		//
+		// Systolic Cycle Counters
+		//
+		
+		/* handy values */
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
+	
+		/* counters */
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_init;
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load;
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload;
+		
+		/* handy increment values */
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_init_next		= syst_cnt_init   + 1'b1;
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_next		= syst_cnt_load   + 1'b1;
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload_next		= syst_cnt_unload + 1'b1;
+
+		/* handy stop flags */
+	wire										syst_cnt_init_done		= (syst_cnt_init   == syst_cnt_last) ? 1'b1 : 1'b0;
+	wire										syst_cnt_load_done		= (syst_cnt_load   == syst_cnt_last) ? 1'b1 : 1'b0;
+	wire										syst_cnt_unload_done		= (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
+
+		/* delayed load counter */
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_dly;
+	always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;
+
+
+		//
+		// Multiplier Iteration Counter
+		//
+		
+		/* handy values */
+	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
+	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
+	
+		/* counter */
+	reg	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt;
+	
+		/* handy increment value and stop flag */
+	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_next = mult_cnt + 1'b1;
+	wire										mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;
+			
+			
+		//
+		// Initialization Counter Control Logic
+		//
+	always @(posedge clk) begin
+		// mult_cnt: cleared in every LOAD_*_START state, incremented (wrapping at its SYSTOLIC_ARRAY_POWER-bit width) in every LOAD_*_SHIFT state
+		case (fsm_state)
+			FSM_STATE_LOAD_B_START,
+			FSM_STATE_LOAD_N_COEFF_START,
+			FSM_STATE_LOAD_N_START:				mult_cnt <= mult_cnt_zero;
+			
+			FSM_STATE_LOAD_B_SHIFT,
+			FSM_STATE_LOAD_N_COEFF_SHIFT,
+			FSM_STATE_LOAD_N_SHIFT:				mult_cnt <= mult_cnt_next;
+		endcase
+		// syst_cnt_init: cleared in every LOAD_*_START state, advanced in every LOAD_*_WRITE state until it equals syst_cnt_last
+		case (fsm_state)
+			FSM_STATE_LOAD_B_START,
+			FSM_STATE_LOAD_N_COEFF_START,
+			FSM_STATE_LOAD_N_START:				syst_cnt_init <= syst_cnt_zero;
+			
+			FSM_STATE_LOAD_B_WRITE,
+			FSM_STATE_LOAD_N_COEFF_WRITE,
+			FSM_STATE_LOAD_N_WRITE:				syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
+		endcase
+		// neither case has a default branch: both counters hold their value in all other states
+	end
+	
+	
+		//
+		// Operand Loader
+		//
+	
+		/*
+		 * Explain how parallelized loader works here...
+		 *
+		 */
+	
+		/* loader banks */
+	localparam	[ 1: 0]	LOADER_ADDR_MSB_B				= 2'd0;
+	localparam	[ 1: 0]	LOADER_ADDR_MSB_N_COEFF		= 2'd1;
+	localparam	[ 1: 0]	LOADER_ADDR_MSB_N				= 2'd2;
+	
+		/* loader input */
+	reg	[                  2-1:0]	loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];	
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
+	reg										loader_wren    [0:SYSTOLIC_ARRAY_LENGTH-1];
+	reg	[                 32-1:0]	loader_din     [0:SYSTOLIC_ARRAY_LENGTH-1];
+	
+		/* loader output */
+	wire	[                 32-1:0]	loader_dout    [0:SYSTOLIC_ARRAY_LENGTH-1];
+			
+		/* generate parallelized loader */
+		
+		//
+		// Loader currently stores B, N_COEFF and N, it can be coded another way
+		// to initially store B, then AB, then Q. Some memory can be saved that way.
+		// Maybe later...
+		//
+		
+	genvar i;
+	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
+		// one 32-bit wide bram_1rw_readfirst instance per index i, SYSTOLIC_ARRAY_LENGTH instances in total
+		begin : gen_bram_1rw_readfirst_loader
+			// address = {2-bit bank select (loader_addr_msb), SYSTOLIC_CNTR_WIDTH-bit offset (loader_addr_lsb)}, hence the "+ 2"
+			bram_1rw_readfirst #
+			(
+				.MEM_WIDTH		(32),
+				.MEM_ADDR_BITS	(SYSTOLIC_CNTR_WIDTH + 2)
+			)
+			bram_loader
+			(
+				.clk		(clk),
+				.a_addr	({loader_addr_msb[i], loader_addr_lsb[i]}),
+				.a_wr		(loader_wren[i]),
+				.a_in		(loader_din[i]),
+				.a_out	(loader_dout[i])
+			);
+			// each instance has its own address, write enable and data elements of the loader_* arrays
+		end
+		//
+	endgenerate
+	
+
+		//
+		// Block Memory Addresses
+		//
+		
+		/*
+		 * Explain why there are two memory sizes.
+		 *
+		 */
+		
+		/* the very first addresses */
+	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero			= {      {OPERAND_ADDR_WIDTH{1'b0}}};
+	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_zero	= {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
+	
+		/* the very last addresses */
+	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last     = {ab_num_words_latch};
+	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_last = {ab_num_words_latch, 1'b1};
+
+		/* address registers */
+	reg	[OPERAND_ADDR_WIDTH-1:0]	a_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr;
+	reg	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	q_addr;
+	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	s_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr;
+		
+		/* handy increment values */
+	wire	[OPERAND_ADDR_WIDTH-1:0]	a_addr_next			= a_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next			= b_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_next	= n_coeff_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_addr_next			= n_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_next	= ab_addr_ext  + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_next			= q_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_next	= qn_addr_ext  + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_addr_next			= s_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_addr_next		= sn_addr      + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	r_addr_next			= r_addr       + 1'b1;
+	
+		/* handy stop flags */
+	wire	a_addr_done			= (a_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	b_addr_done			= (b_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	n_coeff_addr_done	= (n_coeff_addr  == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	n_addr_done			= (n_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	ab_addr_ext_done	= (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
+	wire	q_addr_done			= (q_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	qn_addr_ext_done	= (qn_addr_ext     == bram_addr_ext_last)     ? 1'b1 : 1'b0;
+	wire	s_addr_done	= (s_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	sn_addr_done	= (sn_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	r_addr_done	= (r_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
+
+		/* B, N_COEFF and N addresses delayed by one clock cycle */
+	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_dly;
+	always @(posedge clk) b_addr_dly <= b_addr;
+
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_dly;
+	always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr;
+
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr_dly;
+	always @(posedge clk) n_addr_dly <= n_addr;
+				
+		/* map registers to top-level ports */
+	assign a_bram_addr = a_addr;
+	assign b_bram_addr = b_addr;
+	assign n_coeff_bram_addr = n_coeff_addr;
+	assign n_bram_addr = n_addr;
+	assign r_bram_addr = r_addr;
+
+
+		//
+		// Flag
+		//
+	reg	flag_select_s;
+	
+	
+		//
+		// Memory Address Control Logic
+		//
+	always @(posedge clk) begin
+		// operand loading: b_addr, n_coeff_addr and n_addr are cleared entering LOAD_*_START and incremented entering LOAD_*_SHIFT
+		case (fsm_next_state)
+			FSM_STATE_LOAD_B_START:				b_addr <= bram_addr_zero;
+			FSM_STATE_LOAD_N_COEFF_START:		n_coeff_addr <= bram_addr_zero;
+			FSM_STATE_LOAD_N_START:				n_addr <= bram_addr_zero;
+			
+			FSM_STATE_LOAD_B_SHIFT:				b_addr <= b_addr_next;
+			FSM_STATE_LOAD_N_COEFF_SHIFT:		n_coeff_addr <= n_coeff_addr_next;
+			FSM_STATE_LOAD_N_SHIFT:				n_addr <= n_addr_next;
+		endcase
+		// MULT_Q_N_RELOAD: n_addr restarts at zero when qn_addr_ext equals bram_addr_last and increments once qn_addr_ext is past it (this later assignment wins over the case above)
+		case (fsm_state)
+			FSM_STATE_MULT_Q_N_RELOAD: 
+				if (qn_addr_ext == {1'b0, bram_addr_last})
+					n_addr		<= bram_addr_zero;
+				else if (qn_addr_ext > {1'b0, bram_addr_last})
+					n_addr		<= n_addr_next;
+			
+		endcase
+		// result saving: r_addr is cleared in SAVE_START and incremented in SAVE_WRITE
+		case (fsm_state)
+			FSM_STATE_SAVE_START:	r_addr <= bram_addr_zero;
+			FSM_STATE_SAVE_WRITE:	r_addr <= r_addr_next;
+		endcase
+		// A*B multiplication: a_addr is cleared entering MULT_A_B_START and advanced entering MULT_A_B_RELOAD, saturating at bram_addr_last
+		case (fsm_next_state)
+			FSM_STATE_MULT_A_B_START:	a_addr <= bram_addr_zero;
+			FSM_STATE_MULT_A_B_RELOAD:	a_addr <= !a_addr_done ? a_addr_next : a_addr;
+		endcase
+		// address registers not assigned in the current cycle keep their value
+	end
+	
+	
+		//
+		// Internal Memories
+		//
+
+		/* memory inputs */
+	reg	[31: 0]	ab_data_in;
+	reg	[31: 0]	q_data_in;
+	reg	[31: 0]	qn_data_in;
+	wire	[31: 0]	s_data_in;
+	wire	[31: 0]	sn_data_in;
+	reg	[31: 0]	r_data_in;
+
+		/* memory outputs */
+	wire	[31: 0]	ab_data_out;
+	wire	[31: 0]	q_data_out;
+	wire	[31: 0]	qn_data_out;
+	wire	[31: 0]	s_data_out;
+	wire	[31: 0]	sn_data_out;
+
+		/* write enables */
+	reg	ab_wren;
+	reg	q_wren;
+	reg	qn_wren;
+	reg	s_wren;
+	reg	sn_wren;
+	reg	r_wren;
+	
+		/* map */
+	assign r_bram_in = r_data_in;
+	assign r_bram_wr = r_wren;
+
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
+	bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
+
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
+	
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
+	bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
+
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));
+
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));
+
+	
+		//
+		// Wide Operand Loader
+		//
+	integer j;
+	
+		/* shift logic */
+	always @(posedge clk)
+		// loader_din works as a shift buffer: one new word enters at the top index per *_SHIFT cycle
+		case (fsm_state)
+			//
+			FSM_STATE_LOAD_B_SHIFT: begin
+		
+						/* every slot takes over the word of the next higher slot */
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH-1; j=j+1)
+					loader_din[j] <= loader_din[j+1];
+				
+						/* next word of B enters at the top, zero once past the last operand word */
+				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly > bram_addr_last) ? 32'd0 : b_bram_out;
+					
+			end
+			//
+			FSM_STATE_LOAD_N_COEFF_SHIFT: begin
+		
+						/* every slot takes over the word of the next higher slot */
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH-1; j=j+1)
+					loader_din[j] <= loader_din[j+1];
+				
+						/* next word of N_COEFF enters at the top, zero once past the last operand word */
+				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly > bram_addr_last) ? 32'd0 : n_coeff_bram_out;
+					
+			end
+			//
+			FSM_STATE_LOAD_N_SHIFT: begin
+		
+						/* every slot takes over the word of the next higher slot */
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH-1; j=j+1)
+					loader_din[j] <= loader_din[j+1];
+				
+						/* next word of N enters at the top, zero once past the last operand word */
+				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly > bram_addr_last) ? 32'd0 : n_bram_out;
+					
+			end
+			//
+		endcase
+		
+
+		/* write enable logic */
+	always @(posedge clk)
+		// all loader memories share the same write strobe value, registered from fsm_next_state
+		case (fsm_next_state)
+		
+			FSM_STATE_LOAD_B_WRITE,
+			FSM_STATE_LOAD_N_COEFF_WRITE,
+			FSM_STATE_LOAD_N_WRITE:
+				// next state is one of the write states: assert every strobe
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_wren[j] <= 1'b1;
+					
+			default:
+				// any other next state: deassert every strobe
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_wren[j] <= 1'b0;
+					
+		endcase
+
+		/*
+		 * Loader address update logic.
+		 *
+		 * Every lane j gets the same address. The address is split into an
+		 * LSB part (word index, loader_addr_lsb) and an MSB part (operand
+		 * selector, loader_addr_msb). All assignments are non-blocking, so if
+		 * two of the case statements below ever matched in the same cycle the
+		 * textually later one would win. With the transitions defined in the
+		 * FSM of this module, the fsm_state-keyed LOAD branches and the
+		 * fsm_next_state-keyed MULT branches do not match at the same time.
+		 */
+	always @(posedge clk) begin
+		//
+			/* LSB while filling the loader memories (keyed on current state) */
+		case (fsm_state)
+		
+				/* start of a load phase: rewind to the first word */
+			FSM_STATE_LOAD_B_START,
+			FSM_STATE_LOAD_N_COEFF_START,
+			FSM_STATE_LOAD_N_START:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_lsb[j] <= syst_cnt_zero;
+					
+				/* after each write: step to the next word until syst_cnt_init_done */
+			FSM_STATE_LOAD_B_WRITE,
+			FSM_STATE_LOAD_N_COEFF_WRITE,
+			FSM_STATE_LOAD_N_WRITE:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
+					
+		endcase
+		//
+			/* LSB while reading the loader memories back (keyed on next state) */
+		case (fsm_next_state)
+				/* entering START or RELOAD: rewind to the first word */
+			FSM_STATE_MULT_A_B_START,
+			FSM_STATE_MULT_AB_N_COEFF_START,
+			FSM_STATE_MULT_Q_N_START,
+			FSM_STATE_MULT_A_B_RELOAD,
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+			FSM_STATE_MULT_Q_N_RELOAD:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_lsb[j] <= syst_cnt_zero;
+													
+				/* while crunching: follow the load counter until syst_cnt_load_done */
+				/* NOTE(review): the fallback value is syst_cnt_init, not syst_cnt_load  */
+				/* as in the syst_cnt_load update elsewhere in this file; confirm this   */
+				/* is intended and not a copy-paste from the LOAD_*_WRITE branch above.  */
+			FSM_STATE_MULT_A_B_CRUNCH,
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+			FSM_STATE_MULT_Q_N_CRUNCH:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_init;
+		endcase
+		//
+			/* MSB: select which operand region is addressed (keyed on next state) */
+		case (fsm_next_state)
+		
+				/* B region: used when loading B and when multiplying A * B */
+			FSM_STATE_LOAD_B_START,
+			FSM_STATE_MULT_A_B_START:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_msb[j] <= LOADER_ADDR_MSB_B;
+
+				/* N_COEFF region: used when loading N_COEFF and when multiplying AB * N_COEFF */
+			FSM_STATE_LOAD_N_COEFF_START,
+			FSM_STATE_MULT_AB_N_COEFF_START:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
+					
+				/* N region: used when loading N and when multiplying Q * N */
+			FSM_STATE_LOAD_N_START,
+			FSM_STATE_MULT_Q_N_START:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_msb[j] <= LOADER_ADDR_MSB_N;
+
+		endcase
+		//
+	end
+	
+	
+		//
+		// Systolic Array of Processing Elements
+		//
+		/* per-PE operand inputs, registered by the feed logic of this module */
+	reg	[31: 0]	pe_a        [0:SYSTOLIC_ARRAY_LENGTH-1];
+	reg	[31: 0]	pe_b        [0:SYSTOLIC_ARRAY_LENGTH-1];
+		/* per-PE T and carry inputs, sliced out of the FIFO outputs */
+	wire	[31: 0]	pe_t        [0:SYSTOLIC_ARRAY_LENGTH-1];
+	wire	[31: 0]	pe_c_in     [0:SYSTOLIC_ARRAY_LENGTH-1];
+		/* per-PE product and carry outputs */
+	wire	[31: 0]	pe_p        [0:SYSTOLIC_ARRAY_LENGTH-1];
+	wire	[31: 0]	pe_c_out    [0:SYSTOLIC_ARRAY_LENGTH-1];
+		/* carry outputs delayed by one clock before they enter fifo_c */
+	reg	[31: 0]	pe_c_out_dly[0:SYSTOLIC_ARRAY_LENGTH-1];
+	
+
+		//
+		// These can be turned into a FIFO (maybe later?)...
+		//
+		// (the two memories below are commented out; the fifo_c and fifo_t
+		// instances that follow carry the C_OUT and T values instead)
+		//
+	//reg	[31: 0]	pe_c_out_mem[0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
+	//reg	[31: 0]	pe_t_mem    [0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
+
+		/* FIFO resets, driven by the FSM-keyed reset logic of this module */
+	reg	fifo_c_rst;
+	reg	fifo_t_rst;
+
+		/* FIFO write/read strobes */
+	wire	fifo_c_wren;
+	wire	fifo_c_rden;
+	
+	wire	fifo_t_wren;
+	wire	fifo_t_rden;
+		
+		/* wide FIFO buses: one 32-bit word per PE, packed side by side */
+	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_c_din;
+	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_c_dout;
+	
+	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_t_din;
+	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	fifo_t_dout;
+	
+	/* FIFO for the carry words (pe_c_out_dly -> pe_c_in) */
+	modexpa7_simple_fifo #
+	(
+		.BUS_WIDTH	(32 * SYSTOLIC_ARRAY_LENGTH),
+		.DEPTH_BITS	(SYSTOLIC_CNTR_WIDTH)
+	)
+	fifo_c
+	(
+		.clk			(clk),
+		.rst			(fifo_c_rst),
+		.wr_en		(fifo_c_wren),
+		.d_in			(fifo_c_din),
+		.rd_en		(fifo_c_rden),
+		.d_out		(fifo_c_dout)
+	);
+	
+	/* FIFO for the T words (fifo_t_din -> pe_t) */
+	modexpa7_simple_fifo #
+	(
+		.BUS_WIDTH	(32 * SYSTOLIC_ARRAY_LENGTH),
+		.DEPTH_BITS	(SYSTOLIC_CNTR_WIDTH)
+	)
+	fifo_t
+	(
+		.clk			(clk),
+		.rst			(fifo_t_rst),
+		.wr_en		(fifo_t_wren),
+		.d_in			(fifo_t_din),
+		.rd_en		(fifo_t_rden),
+		.d_out		(fifo_t_dout)
+	);
+	
+	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
+		begin : modexpa7_systolic_pe_multiplier
+			//
+				/* PE #i owns bits [32*i +: 32] of every wide FIFO bus */
+			assign pe_t[i]    = fifo_t_dout[32 * i +: 32];
+			assign pe_c_in[i] = fifo_c_dout[32 * i +: 32];
+			//
+				/* carry output is delayed by one clock before it is queued */
+			always @(posedge clk)
+				pe_c_out_dly[i] <= pe_c_out[i];
+			//
+			assign fifo_c_din[32 * i +: 32] = pe_c_out_dly[i];
+			//
+				/* the processing element itself */
+			modexpa7_systolic_pe systolic_pe_inst
+			(
+				.clk		(clk),
+				.a			(pe_a[i]),
+				.b			(pe_b[i]),
+				.t			(pe_t[i]),
+				.c_in		(pe_c_in[i]),
+				.p			(pe_p[i]),
+				.c_out	(pe_c_out[i])
+			);
+			//
+		end
+	endgenerate
+
+
+
+		
+			
+			//
+			// Shift Registers
+			//
+		/*
+		 * Three shift registers track the phases of one pass through the
+		 * array. Each is seeded with (or fed) a single 1 that is shifted left
+		 * once per clock while the FSM is in a *_CRUNCH state.
+		 */
+	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_load;
+	reg	[SYSTOLIC_PE_LATENCY  :0]	shreg_latency;
+	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_unload;
+
+		/* "done" taps: asserted while the walking 1 sits at the tapped position */
+	wire	shreg_done_load = shreg_load[syst_cnt_last];
+	wire	shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
+	wire	shreg_done_unload = shreg_unload[syst_cnt_last];
+
+		/* phase flags derived from the taps above */
+	reg										shreg_now_loading;
+	reg										shreg_now_latency;
+	reg										shreg_now_unloading;
+	
+		/* shreg_done_latency delayed by one clock */
+	reg										shreg_done_latency_dly;
+	
+	always @(posedge clk)
+		shreg_done_latency_dly <= shreg_done_latency;
+
+	always @(posedge clk)
+		//
+		case (fsm_state)
+				/* before the first multiplication: clear all three shift registers */
+				/* (the phase flags are not assigned in this state and keep their value) */
+			FSM_STATE_LOAD_N_FINAL: begin
+				shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+				shreg_latency	<= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
+				shreg_unload	<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+			end
+			//
+				/* start of a pass: seed load and latency registers, mark loading/latency active */
+			FSM_STATE_MULT_A_B_START,
+			FSM_STATE_MULT_AB_N_COEFF_START,
+			FSM_STATE_MULT_Q_N_START,
+			FSM_STATE_MULT_A_B_RELOAD,
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+			FSM_STATE_MULT_Q_N_RELOAD: begin
+				shreg_now_loading	<= 1'b1;
+				shreg_now_latency <= 1'b1;
+				shreg_now_unloading <= 1'b0;
+				shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
+				shreg_latency	<= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
+				shreg_unload	<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+			end
+			//
+				/* during a pass: advance all registers; the bit leaving shreg_latency enters shreg_unload */
+			FSM_STATE_MULT_A_B_CRUNCH,
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+			FSM_STATE_MULT_Q_N_CRUNCH: begin
+				shreg_load		<= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
+				shreg_latency	<= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
+				shreg_unload	<= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
+				
+					/* loading ends at the load tap; unloading starts at the latency tap and ends at the unload tap */
+				if (shreg_done_load) shreg_now_loading <= 1'b0;
+				if (shreg_done_latency) shreg_now_latency <= 1'b0;
+				if (shreg_done_latency) shreg_now_unloading <= 1'b1;
+				else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
+				
+			end
+			//
+				/* any other state: no phase is active */
+			default: begin
+				shreg_now_loading <= 1'b0;
+				shreg_now_latency <= 1'b0;
+				shreg_now_unloading <= 1'b0;
+			end
+			//
+		endcase
+		
+		
+	/*
+	 * FIFO reset control (identical for both FIFOs).
+	 *
+	 * Both resets are raised in any *_START state and dropped in a *_CRUNCH
+	 * state once shreg_done_load is seen. In all other states they hold
+	 * their value.
+	 */
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			//
+			FSM_STATE_MULT_A_B_START,
+			FSM_STATE_MULT_AB_N_COEFF_START,
+			FSM_STATE_MULT_Q_N_START: begin
+				fifo_c_rst <= 1'b1;
+				fifo_t_rst <= 1'b1;
+			end
+			//
+			FSM_STATE_MULT_A_B_CRUNCH,
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+			FSM_STATE_MULT_Q_N_CRUNCH:
+				//
+				if (shreg_done_load) begin
+					fifo_c_rst <= 1'b0;
+					fifo_t_rst <= 1'b0;
+				end
+			//
+		endcase
+
+
+	reg	[32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0]	pe_p_msb_dly;
+	reg															shreg_now_unloading_dly;
+	
+		/* one-clock delays: products of PEs 1..N-1 and the unloading flag */
+	always @(posedge clk) begin
+		//
+		shreg_now_unloading_dly <= shreg_now_unloading;
+		//
+		for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+			pe_p_msb_dly[32 * (j - 1) +: 32] <= pe_p[j];
+		//
+	end
+	
+		/* product of PE 0 is passed on only while unloading, otherwise zero */
+	wire	[31: 0]	pe_p_lsb_masked = shreg_now_unloading ? pe_p[0] : 32'd0;
+	
+		/* T word for the top lane is the masked PE 0 product, lane k below it gets the delayed product of PE k+1 */
+	assign fifo_t_din = {pe_p_lsb_masked, pe_p_msb_dly};
+	
+		/* both FIFOs are written while (delayed) unloading and read while loading */
+	assign fifo_t_wren = shreg_now_unloading_dly;
+	assign fifo_c_wren = shreg_now_unloading_dly;
+	
+	assign fifo_t_rden = shreg_now_loading;
+	assign fifo_c_rden = shreg_now_loading;
+	
+	
+		
+		
+	/*
+	 * Address counters for the AB, Q, QN, S and SN memories.
+	 *
+	 * All assignments are non-blocking, so when more than one of the case
+	 * statements below assigns the same register in one cycle, the
+	 * textually last assignment takes effect. The two fsm_next_state-keyed
+	 * case statements at the end therefore take priority over the
+	 * fsm_state-keyed ones for ab_addr_ext and q_addr.
+	 */
+	always @(posedge clk) begin
+		//
+			/* write-side addresses: rewind on START, advance on RELOAD */
+		case (fsm_state)
+			FSM_STATE_MULT_A_B_START:				ab_addr_ext		<= bram_addr_ext_zero;
+			FSM_STATE_MULT_AB_N_COEFF_START:		q_addr			<= bram_addr_zero;
+			FSM_STATE_MULT_Q_N_START: begin		qn_addr_ext		<= bram_addr_ext_zero;
+															ab_addr_ext		<= bram_addr_ext_zero;															
+															end
+			
+			FSM_STATE_MULT_A_B_RELOAD:				ab_addr_ext		<= ab_addr_ext_next;
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	q_addr			<= q_addr_next;
+			FSM_STATE_MULT_Q_N_RELOAD: begin		qn_addr_ext		<= qn_addr_ext_next;
+															ab_addr_ext		<= ab_addr_ext_next;
+															end
+		endcase
+		//
+			/* S / SN addresses */
+		case (fsm_state)
+		
+			FSM_STATE_MULT_Q_N_RELOAD: begin
+					/* QN address at the last non-extended word: rewind S / SN */
+				if (qn_addr_ext == {1'b0, bram_addr_last}) begin
+					s_addr	<= bram_addr_zero;
+					sn_addr	<= bram_addr_zero;
+				end
+				
+					/* QN address strictly between the two "last" values: advance S / SN */
+				if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
+					s_addr <= s_addr_next;
+					sn_addr <= sn_addr_next;
+				end
+
+					/* QN address at the last extended word: rewind S / SN again */
+				if (qn_addr_ext == bram_addr_ext_last) begin
+					s_addr <= bram_addr_zero;
+					sn_addr <= bram_addr_zero;
+				end
+			
+			end
+			
+				/* result save phase: step through S / SN until the respective done flag */
+			FSM_STATE_MULT_Q_N_FINAL,
+			FSM_STATE_SAVE_START,
+			FSM_STATE_SAVE_WRITE: begin
+				s_addr <= !s_addr_done ? s_addr_next : s_addr;
+				sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
+			end
+			
+		endcase
+		
+		//
+			/* AB address while AB is being multiplied by N_COEFF (keyed on next state) */
+		case (fsm_next_state)
+			FSM_STATE_MULT_AB_N_COEFF_START:		ab_addr_ext <= bram_addr_ext_zero;
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	ab_addr_ext <= ab_addr_ext_next;
+		endcase
+		//
+			/* Q address while Q is being multiplied by N (keyed on next state) */
+		case (fsm_next_state)
+			FSM_STATE_MULT_Q_N_START:		q_addr <= bram_addr_zero;
+			FSM_STATE_MULT_Q_N_RELOAD:		q_addr <= !q_addr_done ? q_addr_next : q_addr;
+		endcase
+
+		//
+	end
+		
+	/*
+	 * Write strobes and write data for the AB, Q and QN memories, plus the
+	 * write strobe of the result memory R.
+	 *
+	 * Each product memory is written only during its own *_CRUNCH state,
+	 * in the cycle flagged by shreg_done_latency_dly, with the product of
+	 * PE 0. In every other case the strobe is low and the data is don't-care.
+	 */
+	always @(posedge clk) begin
+		//
+			/* defaults: nothing is written, data is don't-care */
+		ab_wren		<= 1'b0;
+		ab_data_in	<= 32'hXXXXXXXX;
+		q_wren		<= 1'b0;
+		q_data_in	<= 32'hXXXXXXXX;
+		qn_wren		<= 1'b0;
+		qn_data_in	<= 32'hXXXXXXXX;
+		//
+			/* override the defaults for the memory that belongs to the current state */
+		case (fsm_state)
+			//
+			FSM_STATE_MULT_A_B_CRUNCH: begin
+				ab_wren		<= shreg_done_latency_dly;
+				ab_data_in	<= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+			end
+			//
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH: begin
+				q_wren		<= shreg_done_latency_dly;
+				q_data_in	<= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+			end
+			//
+			FSM_STATE_MULT_Q_N_CRUNCH: begin
+				qn_wren		<= shreg_done_latency_dly;
+				qn_data_in	<= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+			end
+			//
+		endcase
+		//
+			/* result memory: written from SAVE_START until r_addr_done */
+		case (fsm_state)
+			FSM_STATE_SAVE_START:	r_wren <= 1'b1;
+			FSM_STATE_SAVE_WRITE:	r_wren <= ~r_addr_done;
+			default:						r_wren <= 1'b0;
+		endcase
+		//
+	end
+	
+	
+	/*
+	 * Load counter: rewound when the FSM is about to enter a *_START or
+	 * *_RELOAD state, stepped while it is about to be in a *_CRUNCH state
+	 * until syst_cnt_load_done, then frozen.
+	 */
+	always @(posedge clk)
+		//
+		case (fsm_next_state)
+			//
+			FSM_STATE_MULT_A_B_CRUNCH,
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+			FSM_STATE_MULT_Q_N_CRUNCH:
+				//
+				syst_cnt_load <= syst_cnt_load_done ? syst_cnt_load : syst_cnt_load_next;
+			//
+			FSM_STATE_MULT_A_B_START,
+			FSM_STATE_MULT_AB_N_COEFF_START,
+			FSM_STATE_MULT_Q_N_START,
+			FSM_STATE_MULT_A_B_RELOAD,
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+			FSM_STATE_MULT_Q_N_RELOAD:
+				//
+				syst_cnt_load <= syst_cnt_zero;
+			//
+		endcase
+
+
+	/*
+	 * Unload counter: only active in *_CRUNCH states. Rewound when
+	 * shreg_done_latency is seen, otherwise stepped while unloading until
+	 * syst_cnt_unload_done, then frozen.
+	 */
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			//
+			FSM_STATE_MULT_A_B_CRUNCH,
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+			FSM_STATE_MULT_Q_N_CRUNCH:
+				//
+				if (shreg_done_latency)
+					syst_cnt_unload <= syst_cnt_zero;
+				else if (shreg_now_unloading)
+					syst_cnt_unload <= syst_cnt_unload_done ? syst_cnt_unload : syst_cnt_unload_next;
+			//
+		endcase
+	
+		
+			//
+			// T and C_IN can be moved to a separate code block
+			//
+	/*
+	 * A and B operand feed for the processing elements.
+	 *
+	 * In a *_CRUNCH state, while shreg_now_loading is set, every PE gets the
+	 * same A word (taken from the memory that belongs to the current
+	 * multiplication) and its own B word from the loader. When not loading,
+	 * both inputs are don't-care. Outside the *_CRUNCH states the registers
+	 * hold their value.
+	 *
+	 * The A word is forced to zero in the A*B and Q*N phases when the
+	 * extended write address has moved past the A (resp. Q) read address.
+	 */
+	always @(posedge clk) begin
+		//
+		for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+			//
+			case (fsm_state)
+				//
+				FSM_STATE_MULT_A_B_CRUNCH:
+					//
+					if (!shreg_now_loading) begin
+						pe_a[j]	<= 32'hXXXXXXXX;
+						pe_b[j]	<= 32'hXXXXXXXX;
+					end else begin
+						pe_a[j]	<= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out;
+						pe_b[j]	<= loader_dout[j];
+					end
+				//
+				FSM_STATE_MULT_AB_N_COEFF_CRUNCH:
+					//
+					if (!shreg_now_loading) begin
+						pe_a[j]	<= 32'hXXXXXXXX;
+						pe_b[j]	<= 32'hXXXXXXXX;
+					end else begin
+						pe_a[j]	<= ab_data_out;
+						pe_b[j]	<= loader_dout[j];
+					end
+				//
+				FSM_STATE_MULT_Q_N_CRUNCH:
+					//
+					if (!shreg_now_loading) begin
+						pe_a[j]	<= 32'hXXXXXXXX;
+						pe_b[j]	<= 32'hXXXXXXXX;
+					end else begin
+						pe_a[j]	<= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
+						pe_b[j]	<= loader_dout[j];
+					end
+				//
+			endcase
+		//
+	end
+		
+		
+		//
+		// Adder
+		//
+		/*
+		 * This adder is used to calculate S = AB + QN.
+		 *
+		 */
+	reg				add1_ce;					// clock enable
+	reg	[31: 0]	add1_s;					// sum output
+	wire				add1_c_in;				// carry input
+	wire	[31: 0]	add1_a;					// A-input
+	reg	[31: 0]	add1_b;					// B-input
+	reg				add1_c_in_mask;		// flag to not carry anything into the very first word
+	reg				add1_c_out;				// carry output
+	
+		/* add masking into carry feedback chain */
+		/* (carry out of the previous word is fed back unless the mask is set) */
+	assign add1_c_in = add1_c_out & ~add1_c_in_mask;
+
+		/* mask carry for the very first word of N */
+		/* (legacy line kept for reference; the mask is now driven by the */
+		/* FSM_STATE_MULT_Q_N_CRUNCH-keyed always block further down)      */
+	//always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+	
+		/* 32-bit add with carry, result registered; operands are zero-extended */
+		/* to 33 bits so that the carry lands in add1_c_out                      */
+	always @(posedge  clk)
+		//
+		if (add1_ce)
+			//
+			{add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in};
+	
+		/* A-input is the word being written into the QN memory */
+	assign add1_a = qn_data_in;
+	
+		/* B-input is the matching AB word, captured in the cycle flagged by */
+		/* shreg_done_latency_dly during the Q*N phase; don't-care otherwise */
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			add1_b <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX;
+		else
+			add1_b <= 32'hXXXXXXXX;
+
+		/* carry is masked when the word being added is the one at AB address zero */
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			add1_c_in_mask <= (shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero)) ? 1'b1 : 1'b0;
+		else
+			add1_c_in_mask <= 1'b0;
+
+		/* adder is enabled for one clock per word, aligned with add1_b and qn_data_in */
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			add1_ce <= shreg_done_latency_dly;
+		else
+			add1_ce <= 1'b0;
+
+
+	//
+	// Subtractor
+	//
+	/*
+	 * This subtractor is used to calculate SN = S - N.
+	 *
+	 * It is declared before the S / SN write-port hook-up below, so that
+	 * sub1_d and sub1_ce are declared before their first use.
+	 */
+	reg				sub1_ce;					// clock enable
+	reg	[31: 0]	sub1_d;					// difference output
+	wire				sub1_b_in;				// borrow input
+	wire	[31: 0]	sub1_a;					// A-input
+	reg	[31: 0]	sub1_b;					// B-input
+	reg				sub1_b_in_mask;		// flag to not borrow anything from the very first word
+	reg				sub1_b_out;				// borrow output
+	
+		/* add masking into borrow feedback chain */
+	assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
+	
+		/* 32-bit subtract with borrow, result registered; operands are */
+		/* zero-extended to 33 bits so the borrow lands in sub1_b_out    */
+	always @(posedge  clk)
+		//
+		if (sub1_ce)
+			//
+			{sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};
+	
+		/* A-input is the sum word just produced by the adder */
+	assign sub1_a = add1_s;
+	
+		/* B-input is the matching word of N, captured while the adder is enabled */
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
+		else
+			sub1_b <= 32'hXXXXXXXX;
+
+		/* borrow is masked for the word whose QN address is one past bram_addr_last */
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
+		else
+			sub1_b_in_mask <= 1'b0;
+
+		/* subtractor runs one clock behind the adder, only once QN address has passed the Q address */
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
+		else
+			sub1_ce <= 1'b0;
+
+
+	//
+	// S / SN Write Ports
+	//
+	/*
+	 * Each signal below has exactly one driver. The write strobes are the
+	 * adder / subtractor clock enables delayed by one clock, so that they
+	 * line up with the registered add1_s / sub1_d outputs.
+	 */
+	assign s_data_in  = add1_s;
+	assign sn_data_in = sub1_d;
+	
+	always @(posedge clk) begin
+		//
+		s_wren  <= add1_ce;
+		sn_wren <= sub1_ce;
+		//
+	end
+		
+		
+
+	/*
+	 * Result selection: S is chosen when the subtraction S - N produced a
+	 * borrow and the addition producing S did not carry out; otherwise SN
+	 * is chosen. The flag is sampled once, in FSM_STATE_MULT_Q_N_FINAL.
+	 */
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
+			flag_select_s <= ~add1_c_out & sub1_b_out;
+
+
+		/* feed the selected word to the result memory during the save phase */
+	always @(posedge clk)
+		//
+		if ((fsm_state == FSM_STATE_SAVE_START) || (fsm_state == FSM_STATE_SAVE_WRITE))
+			r_data_in <= flag_select_s ? s_data_out : sn_data_out;
+
+		
+			
+		//
+		// FSM Process
+		//
+	always @(posedge clk or negedge rst_n)
+		//
+		if (!rst_n) begin
+				/* asynchronous active-low reset parks the FSM in IDLE */
+			fsm_state <= FSM_STATE_IDLE;
+		end else begin
+				/* otherwise follow the combinational transition logic */
+			fsm_state <= fsm_next_state;
+		end
+	
+	
+		//
+		// FSM Transition Logic
+		//
+	/*
+	 * Combinational next-state function.
+	 *
+	 * Overall sequence: IDLE -> load B -> load N_COEFF -> load N ->
+	 * multiply A*B -> multiply AB*N_COEFF -> multiply Q*N -> save -> STOP -> IDLE.
+	 */
+	always @* begin
+		//
+			/* default for any state not listed below */
+		fsm_next_state = FSM_STATE_STOP;
+		//
+		case (fsm_state)
+
+				/* wait for the enable trigger */
+			FSM_STATE_IDLE:				if (ena_trig)				fsm_next_state = FSM_STATE_LOAD_B_START;
+												else							fsm_next_state = FSM_STATE_IDLE;
+			//
+				/* load B: shift until mult_cnt_done, write, repeat until syst_cnt_init_done */
+			FSM_STATE_LOAD_B_START:											fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+			FSM_STATE_LOAD_B_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_B_WRITE;
+												else								fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+			FSM_STATE_LOAD_B_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_B_FINAL;
+												else							fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+			FSM_STATE_LOAD_B_FINAL:										fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
+			//
+				/* load N_COEFF: same shift/write loop as for B */
+			FSM_STATE_LOAD_N_COEFF_START:											fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+			FSM_STATE_LOAD_N_COEFF_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
+												else								fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+			FSM_STATE_LOAD_N_COEFF_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
+												else							fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+			FSM_STATE_LOAD_N_COEFF_FINAL:										fsm_next_state = FSM_STATE_LOAD_N_START;
+			//
+				/* load N: same shift/write loop as for B */
+			FSM_STATE_LOAD_N_START:											fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+			FSM_STATE_LOAD_N_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_N_WRITE;
+												else								fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+			FSM_STATE_LOAD_N_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_N_FINAL;
+												else							fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+			FSM_STATE_LOAD_N_FINAL:										fsm_next_state = FSM_STATE_MULT_A_B_START;
+			//
+				/* multiply A * B: crunch until shreg_done_unload, reload, repeat until ab_addr_ext_done */
+			FSM_STATE_MULT_A_B_START:									fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+			FSM_STATE_MULT_A_B_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
+												else							fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+			FSM_STATE_MULT_A_B_RELOAD:	if (ab_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
+												else							fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+			FSM_STATE_MULT_A_B_FINAL:									fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
+			//
+				/* multiply AB * N_COEFF: same crunch/reload loop, repeated until q_addr_done */
+			FSM_STATE_MULT_AB_N_COEFF_START:									fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
+															else							fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	if (q_addr_done)	fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
+															else							fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+			FSM_STATE_MULT_AB_N_COEFF_FINAL:									fsm_next_state = FSM_STATE_MULT_Q_N_START;
+			//
+				/* multiply Q * N: same crunch/reload loop, repeated until qn_addr_ext_done */
+			FSM_STATE_MULT_Q_N_START:									fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+			FSM_STATE_MULT_Q_N_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
+															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+			FSM_STATE_MULT_Q_N_RELOAD:	if (qn_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
+															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+			FSM_STATE_MULT_Q_N_FINAL:									fsm_next_state = FSM_STATE_SAVE_START;
+			//
+				/* save the result: keep writing until r_addr_done */
+			FSM_STATE_SAVE_START:										fsm_next_state = FSM_STATE_SAVE_WRITE;
+			FSM_STATE_SAVE_WRITE:	if (r_addr_done)				fsm_next_state = FSM_STATE_SAVE_FINAL;
+											else								fsm_next_state = FSM_STATE_SAVE_WRITE;
+			FSM_STATE_SAVE_FINAL:										fsm_next_state = FSM_STATE_STOP;
+			//
+				/* STOP lasts one cycle, then the FSM returns to IDLE */
+			FSM_STATE_STOP:												fsm_next_state = FSM_STATE_IDLE;
+
+		endcase
+		//
+	end
+
+
+endmodule
+
+//======================================================================
+// End of file
+//======================================================================

-- 
To stop receiving notification emails like this one, please contact
the administrator of this repository.


More information about the Commits mailing list