[Cryptech-Commits] [core/math/modexpa7] 04/04: * made separate file for low-level settings * turned crazy triple multiplier array into one array with input mux

git at cryptech.is git at cryptech.is
Mon Jul 10 18:46:42 UTC 2017


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch systolic
in repository core/math/modexpa7.

commit 71b75290bf2ade9a4022bad93dc80bfb77f87f40
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Mon Jul 10 15:31:25 2017 +0300

     * made separate file for low-level settings
     * turned crazy triple multiplier array into one array with input mux
---
 src/rtl/modexpa7_settings.v                        |    6 +
 src/rtl/modexpa7_systolic_multiplier.v             | 1298 ++++++++++----------
 src/rtl/pe/modexpa7_adder32.v                      |    2 +-
 ...evel_settings.v => modexpa7_primitive_switch.v} |    1 +
 src/rtl/pe/modexpa7_subtractor32.v                 |    2 +-
 src/rtl/pe/modexpa7_systolic_pe.v                  |    2 +-
 src/tb/tb_systolic_multiplier.v                    |    3 +-
 7 files changed, 670 insertions(+), 644 deletions(-)

diff --git a/src/rtl/modexpa7_settings.v b/src/rtl/modexpa7_settings.v
new file mode 100644
index 0000000..0ec6978
--- /dev/null
+++ b/src/rtl/modexpa7_settings.v
@@ -0,0 +1,6 @@
+localparam	SYSTOLIC_PE_LATENCY		= 4;
+
+localparam	SYSTOLIC_CNTR_WIDTH		= OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER;
+localparam	SYSTOLIC_ARRAY_LENGTH	= 2 ** SYSTOLIC_ARRAY_POWER;
+localparam	SYSTOLIC_NUM_CYCLES		= 2 ** SYSTOLIC_CNTR_WIDTH;
+
diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
index cb1c716..56e7be3 100644
--- a/src/rtl/modexpa7_systolic_multiplier.v
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -40,16 +40,16 @@ module modexpa7_systolic_multiplier #
 	(
 			//
 			// This sets the address widths of memory buffers. Internal data
-			// width is 32 bits, so for e.g. 1024-bit operands buffers must store
-			// 1024 / 32 = 32 words, and these need 5-bit address bus, because
-			// 2 ** 5 = 32.
+			// width is 32 bits, so for e.g. 2048-bit operands buffers must store
+			// 2048 / 32 = 64 words, and these need 6-bit address bus, because
+			// 2 ** 6 = 64.
 			//
-		parameter	OPERAND_ADDR_WIDTH		= 5,
+		parameter	OPERAND_ADDR_WIDTH		= 4,
 		
 			//
-			// This sets the width of the systolic cycle counter. TODO: Explain.
+			// Explain.
 			//
-		parameter	SYSTOLIC_ARRAY_POWER		= 3
+		parameter	SYSTOLIC_ARRAY_POWER		= 2
 	)
 	(
 		input											clk,
@@ -72,801 +72,819 @@ module modexpa7_systolic_multiplier #
 		output	[                32-1:0]	r_bram_in,
 		output										r_bram_wr,
 
-		input		[OPERAND_ADDR_WIDTH-1:0]	n_num_words
+		input		[OPERAND_ADDR_WIDTH-1:0]	ab_num_words
 	);
 	
-	
+		
 		//
-		// Constants
+		// Include Settings
 		//
-	localparam	SYSTOLIC_CNTR_WIDTH		= OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER;
-	localparam	SYSTOLIC_ARRAY_LENGTH	= 2 ** SYSTOLIC_ARRAY_POWER;
-	localparam	SYSTOLIC_NUM_CYCLES		= 2 ** SYSTOLIC_CNTR_WIDTH;
-
-	localparam	SYSTOLIC_PE_LATENCY		= 4;
-	
+	`include "pe/modexpa7_primitive_switch.v"
+	`include "modexpa7_settings.v"
+		
 
 		//
 		// FSM Declaration
 		//
-	localparam	[ 3: 0]	FSM_STATE_IDLE					= 4'd0;
-	localparam	[ 3: 0]	FSM_STATE_INIT_ZERO_ADDR	= 4'd1;
-	localparam	[ 3: 0]	FSM_STATE_INIT_NEXT_ADDR	= 4'd2;
-	localparam	[ 3: 0]	FSM_STATE_INIT_LAST_ADDR	= 4'd3;
-	localparam	[ 3: 0]	FSM_STATE_PIPE_CRUNCH		= 4'd4;
-	localparam	[ 3: 0]	FSM_STATE_PIPE_RELOAD		= 4'd5;
-	localparam	[ 3: 0]	FSM_STATE_SAVE_ZERO_ADDR	= 4'd6;
-	localparam	[ 3: 0]	FSM_STATE_SAVE_NEXT_ADDR	= 4'd7;
-	localparam	[ 3: 0]	FSM_STATE_SAVE_LAST_ADDR	= 4'd8;
-	localparam	[ 3: 0]	FSM_STATE_STOP					= 4'd9;
-	
-	reg	[ 3: 0]	fsm_state = FSM_STATE_IDLE;
-	reg	[ 3: 0]	fsm_next_state;
+	localparam	[ 7: 0]	FSM_STATE_IDLE								= 8'h00;
 
-	
-		//
-		// Enable Delay (Trigger)
-		//
-   reg ena_dly = 1'b0;
-   wire ena_trig = ena && !ena_dly;
-   always @(posedge clk) ena_dly <= ena;		
+	localparam	[ 7: 0]	FSM_STATE_LOAD_B_START					= 8'h11;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_B_SHIFT					= 8'h12;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_B_WRITE					= 8'h13;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_B_FINAL					= 8'h14;
 
-		
-		//
-		// Parameters Latch
-		//
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_num_words_latch;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_START			= 8'h21;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_SHIFT			= 8'h22;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_WRITE			= 8'h23;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_FINAL			= 8'h24;
 
-	always @(posedge clk)
-		//
-		if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR)
-			n_num_words_latch <= n_num_words;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_START					= 8'h31;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_SHIFT					= 8'h32;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_WRITE					= 8'h33;
+	localparam	[ 7: 0]	FSM_STATE_LOAD_N_FINAL					= 8'h34;
 
+	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_START				= 8'h41;
+	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_CRUNCH				= 8'h42;
+	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_RELOAD				= 8'h43;
+	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_FINAL				= 8'h44;
 
-		//
-		// Addresses
-		//
-	localparam	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
-	wire			[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last = n_num_words_latch;
+	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_START		= 8'h51;
+	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_CRUNCH		= 8'h52;
+	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_RELOAD		= 8'h53;
+	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_FINAL		= 8'h54;
+
+	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_START				= 8'h61;
+	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_CRUNCH				= 8'h62;
+	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_RELOAD				= 8'h63;
+	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_FINAL				= 8'h64;
 	
+	localparam	[ 7: 0]	FSM_STATE_STOP								= 8'hFF;
 	
 		//
-		// BRAM Addresses
+		// FSM State / Next State
 		//
-	reg	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_reg;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr_reg;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_reg;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_reg;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_reg;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr_reg;
+	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
+	reg	[ 7: 0]	fsm_next_state;
 
-	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr = s_bram_addr_reg;
-	
-	reg	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_dly;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_dly;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_dly;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_dly;
-	
-	wire	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_next       = b_bram_addr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr_next       = a_bram_addr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_next = n_coeff_bram_addr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_next       = n_bram_addr + 1'b1;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_next       = s_bram_addr + 1'b1;
+
+		//
+		// Enable Delay and Trigger
+		//
+   reg ena_dly = 1'b0;
 	
-	wire										b_bram_addr_done = 
-		(b_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
+		/* delay enable by one clock cycle */
+   always @(posedge clk) ena_dly <= ena;
 
-	wire										s_bram_addr_done = 
-		(s_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
+		/* trigger new operation when enable goes high */
+   wire ena_trig = ena && !ena_dly;
+	
 	
-	assign b_bram_addr = b_bram_addr_reg;
-	assign a_bram_addr = a_bram_addr_reg;
-	assign n_coeff_bram_addr = n_coeff_bram_addr_reg;
-	assign n_bram_addr = n_bram_addr_reg;
-	assign r_bram_addr = r_bram_addr_reg;
+		//
+		// Ready Flag Logic
+		//
+	reg rdy_reg = 1'b1;
+	assign rdy = rdy_reg;
 
-	always @(posedge clk) b_bram_addr_dly <= b_bram_addr;
-	always @(posedge clk) n_coeff_bram_addr_dly <= n_coeff_bram_addr;
-	always @(posedge clk) n_bram_addr_dly <= n_bram_addr;
-	always @(posedge clk) s_bram_addr_dly <= s_bram_addr;
+   always @(posedge clk or negedge rst_n)
+		
+			/* reset flag */
+		if (rst_n == 1'b0) rdy_reg <= 1'b1;
+		else begin
+		
+				/* clear flag when operation is started */
+			if (fsm_state == FSM_STATE_IDLE)	rdy_reg <= ~ena_trig;
+			
+				/* set flag after operation is finished */
+			if (fsm_state == FSM_STATE_STOP)	rdy_reg <= 1'b1;			
+			
+		end
+		
 		
-	always @(posedge clk)
 		//
-		case (fsm_next_state)
-			FSM_STATE_INIT_ZERO_ADDR:	b_bram_addr_reg <= bram_addr_zero;
-			FSM_STATE_INIT_NEXT_ADDR:	b_bram_addr_reg <= b_bram_addr_next;
-		endcase
-
-	always @(posedge clk)
-		case (fsm_next_state)
-			FSM_STATE_SAVE_ZERO_ADDR:	s_bram_addr_reg <= bram_addr_zero;
-			FSM_STATE_SAVE_NEXT_ADDR:	s_bram_addr_reg <= s_bram_addr_next;
-		endcase
-
-	always @(posedge clk)
+		// Parameters Latch
 		//
-		case (fsm_next_state)
-			FSM_STATE_INIT_LAST_ADDR:	a_bram_addr_reg <= bram_addr_zero;
-			FSM_STATE_PIPE_RELOAD:		a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr;
-		endcase
+	reg	[OPERAND_ADDR_WIDTH-1:0]	ab_num_words_latch;
 
+		/* save number of words in a and b when new operation starts */
 	always @(posedge clk)
 		//
-		case (fsm_next_state)
-			FSM_STATE_INIT_ZERO_ADDR:	n_coeff_bram_addr_reg <= bram_addr_zero;
-			FSM_STATE_INIT_NEXT_ADDR:	n_coeff_bram_addr_reg <= n_coeff_bram_addr_next;
-		endcase
-
-
-		
-		
+		if (fsm_next_state == FSM_STATE_LOAD_B_START)
+			ab_num_words_latch <= ab_num_words;
+			
+			
 		//
-		// Latency Compensation TODO: Remove ab maybe? Looks like latency should be consistent for all cycles...
+		// Systolic Cycle Counters
 		//
-	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
-
-	reg	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_lsb;
-	reg	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_msb;
+		
+		/* handy values */
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
 	
-	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_lsb_next =
-		{pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]};
+		/* counters */
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_init;
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load;
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload;
+		
+		/* handy increment values */
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_init_next		= syst_cnt_init   + 1'b1;
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_next		= syst_cnt_load   + 1'b1;
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload_next		= syst_cnt_unload + 1'b1;
 
-	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_msb_next =
-		{pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]};
+		/* handy stop flags */
+	wire										syst_cnt_init_done		= (syst_cnt_init   == syst_cnt_last) ? 1'b1 : 1'b0;
+	wire										syst_cnt_load_done		= (syst_cnt_load   == syst_cnt_last) ? 1'b1 : 1'b0;
+	wire										syst_cnt_unload_done		= (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
 
-	wire										pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY];
-	wire										pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY];
+		/* delayed load counter */
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_dly;
+	always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;
 
-	always @(posedge clk)
-		//
-		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
-			//
-			case (fsm_state)
-				FSM_STATE_INIT_LAST_ADDR,
-				FSM_STATE_PIPE_RELOAD:		pe_latency_ab_lsb <= pe_latency_start;
-				FSM_STATE_PIPE_CRUNCH:		pe_latency_ab_lsb <= pe_latency_ab_lsb_done ?
-														pe_latency_ab_lsb : pe_latency_ab_lsb_next;
-			endcase
 
 		//
-		// Buffers
+		// Multiplier Iteration Counter
 		//
-	integer i, j;
-
-	reg	[31: 0]	b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
-	reg	[31: 0]	n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
-	reg	[31: 0]	n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+		
+		/* handy values */
+	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
+	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
 	
-	always @(posedge clk)
+		/* counter */
+	reg	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt;
+	
+		/* handy increment value and stop flag */
+	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_next = mult_cnt + 1'b1;
+	wire										mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;
+			
+			
 		//
-		case (fsm_state)
-			FSM_STATE_INIT_ZERO_ADDR:
-				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						b_buf[i][j] <= 32'd0;
-
-			FSM_STATE_INIT_NEXT_ADDR,
-			FSM_STATE_INIT_LAST_ADDR:
-				b_buf[b_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][b_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= b_bram_out;
-		endcase
-
-	always @(posedge clk)
+		// Initialization Counter Control Logic
+		//
+	always @(posedge clk) begin
 		//
 		case (fsm_state)
-			FSM_STATE_INIT_ZERO_ADDR:
-				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						n_coeff_buf[i][j] <= 32'd0;
-
-			FSM_STATE_INIT_NEXT_ADDR,
-			FSM_STATE_INIT_LAST_ADDR:
-				n_coeff_buf[n_coeff_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_coeff_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_coeff_bram_out;
+			FSM_STATE_LOAD_B_START,
+			FSM_STATE_LOAD_N_COEFF_START,
+			FSM_STATE_LOAD_N_START:				mult_cnt <= mult_cnt_zero;
+			
+			FSM_STATE_LOAD_B_SHIFT,
+			FSM_STATE_LOAD_N_COEFF_SHIFT,
+			FSM_STATE_LOAD_N_SHIFT:				mult_cnt <= mult_cnt_next;
 		endcase
-
-	always @(posedge clk)
 		//
 		case (fsm_state)
-			FSM_STATE_INIT_ZERO_ADDR:
-				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						n_buf[i][j] <= 32'd0;
-
-			FSM_STATE_INIT_NEXT_ADDR,
-			FSM_STATE_INIT_LAST_ADDR:
-				n_buf[n_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_bram_out;
+			FSM_STATE_LOAD_B_START,
+			FSM_STATE_LOAD_N_COEFF_START,
+			FSM_STATE_LOAD_N_START:				syst_cnt_init <= syst_cnt_zero;
+			
+			FSM_STATE_LOAD_B_WRITE,
+			FSM_STATE_LOAD_N_COEFF_WRITE,
+			FSM_STATE_LOAD_N_WRITE:				syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
 		endcase
-
-		
-	
-		
+		//
+	end
 	
 	
 		//
-		// Cycle Counters
+		// Operand Loader
 		//
-	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_ab;
-	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_q;
-	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_qn;
-	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_s;
-	
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt;
-	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_dly[SYSTOLIC_PE_LATENCY-1:0];
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_latency = syst_cnt_dly[SYSTOLIC_PE_LATENCY-1];
-	
-	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
-	
-	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_half = {1'b0, n_num_words};
 	
-	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_last = {n_num_words, 1'b1};
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
-
-	wire										mult_cnt_ab_done = (mult_cnt_ab == mult_cnt_last) ? 1'b1 : 1'b0;
-	wire										mult_cnt_q_done = (mult_cnt_q == mult_cnt_last) ? 1'b1 : 1'b0;
-	wire										mult_cnt_qn_done = (mult_cnt_qn == mult_cnt_last) ? 1'b1 : 1'b0;
-	wire										mult_cnt_s_done = (mult_cnt_s == mult_cnt_last) ? 1'b1 : 1'b0;
+		/*
+		 * Explain how parallelized loader works here...
+		 *
+		 */
 	
-	wire										syst_cnt_done = (syst_cnt == syst_cnt_last) ? 1'b1 : 1'b0;
-
-	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_ab_next = mult_cnt_ab + 1'b1;
-	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_q_next = mult_cnt_q + 1'b1;
-	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_qn_next = mult_cnt_qn + 1'b1;
-	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_s_next = mult_cnt_s + 1'b1;
+		/* loader banks */
+	localparam	[ 1: 0]	LOADER_ADDR_MSB_B				= 2'd0;
+	localparam	[ 1: 0]	LOADER_ADDR_MSB_N_COEFF		= 2'd1;
+	localparam	[ 1: 0]	LOADER_ADDR_MSB_N				= 2'd2;
 	
-	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_next = syst_cnt_done ? syst_cnt_zero : syst_cnt + 1'b1;
-
+		/* loader input */
+	reg	[                  2-1:0]	loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];	
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
+	reg										loader_wren    [0:SYSTOLIC_ARRAY_LENGTH-1];
+	reg	[                 32-1:0]	loader_din     [0:SYSTOLIC_ARRAY_LENGTH-1];
 	
-	always @(posedge clk)
+		/* loader output */
+	wire	[                 32-1:0]	loader_dout    [0:SYSTOLIC_ARRAY_LENGTH-1];
+			
+		/* generate parallelized loader */
+		
 		//
-		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
-			//
-			case (fsm_state)
-				FSM_STATE_INIT_LAST_ADDR,
-				FSM_STATE_PIPE_RELOAD:		syst_cnt <= syst_cnt_zero;
-				FSM_STATE_PIPE_CRUNCH:		syst_cnt <= syst_cnt_done ? syst_cnt : syst_cnt_next;
-			endcase
-
-	always @(posedge clk)
+		// Loader currently stores B, N_COEFF and N, it can be coded another way
+		// to initially store B, then AB, then Q. Some memory can be saved that way.
+		// Maybe later...
 		//
-		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
-			//
-			case (fsm_state)
-				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_ab <= mult_cnt_zero;
-				FSM_STATE_PIPE_RELOAD:		mult_cnt_ab <= mult_cnt_ab_done ? mult_cnt_ab : mult_cnt_ab_next;
-			endcase
-
-	always @(posedge clk)
+		
+	genvar i;
+	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
 		//
-		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+		begin : gen_bram_1rw_readfirst_loader
 			//
-			case (fsm_state)
-				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_q <= mult_cnt_zero;
-				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_ab > mult_cnt_zero) mult_cnt_q <= mult_cnt_q_done ? mult_cnt_q : mult_cnt_q_next;
-			endcase
+			bram_1rw_readfirst #
+			(
+				.MEM_WIDTH		(32),
+				.MEM_ADDR_BITS	(SYSTOLIC_CNTR_WIDTH + 2)
+			)
+			bram_loader
+			(
+				.clk		(clk),
+				.a_addr	({loader_addr_msb[i], loader_addr_lsb[i]}),
+				.a_wr		(loader_wren[i]),
+				.a_in		(loader_din[i]),
+				.a_out	(loader_dout[i])
+			);
+			//
+		end
+		//
+	endgenerate
+	
 
-	always @(posedge clk)
 		//
-		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
-			//
-			case (fsm_state)
-				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_qn <= mult_cnt_zero;
-				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_q > mult_cnt_zero) mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next;
-			endcase
-		
-	always @(posedge clk)
+		// Block Memory Addresses
 		//
-		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
-			//
-			case (fsm_state)
-				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_s <= mult_cnt_zero;
-				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_qn > mult_cnt_zero) mult_cnt_s <= mult_cnt_s_done ? mult_cnt_qn : mult_cnt_s_next;
-			endcase
 		
+		/*
+		 * Explain why there are two memory sizes.
+		 *
+		 */
+		
+		/* the very first addresses */
+	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero			= {      {OPERAND_ADDR_WIDTH{1'b0}}};
+	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_zero	= {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
+	
+		/* the very last addresses */
+	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last     = {ab_num_words_latch};
+	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_last = {ab_num_words_latch, 1'b1};
+
+		/* address registers */
+	reg	[OPERAND_ADDR_WIDTH-1:0]	a_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr;
+	reg	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	q_addr;
+	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext;
 		
+		/* handy increment values */
+	wire	[OPERAND_ADDR_WIDTH-1:0]	a_addr_next			= a_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next			= b_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_next	= n_coeff_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_addr_next			= n_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_next	= ab_addr_ext  + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_next			= q_addr       + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_next	= qn_addr_ext  + 1'b1;
+	
+		/* handy stop flags */
+	wire	a_addr_done			= (a_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	b_addr_done			= (b_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	n_coeff_addr_done	= (n_coeff_addr  == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	n_addr_done			= (n_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	ab_addr_ext_done	= (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
+	wire	q_addr_done			= (q_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	qn_addr_ext_done	= (qn_addr_ext     == bram_addr_ext_last)     ? 1'b1 : 1'b0;
+
+		/* delayed B address */
+	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_dly;
+	always @(posedge clk) b_addr_dly <= b_addr;
+
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_dly;
+	always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr;
+
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr_dly;
+	always @(posedge clk) n_addr_dly <= n_addr;
+				
+		/* map registers to top-level ports */
+	assign a_bram_addr = a_addr;
+	assign b_bram_addr = b_addr;
+	assign n_coeff_bram_addr = n_coeff_addr;
+	assign n_bram_addr = n_addr;
+
+
+		//
+		// Memory Address Control Logic
+		//
 	always @(posedge clk) begin
-		syst_cnt_dly[0] <= syst_cnt;
-		for (i=1; i<SYSTOLIC_PE_LATENCY; i=i+1)
-			syst_cnt_dly[i] <= syst_cnt_dly[i-1];
+		//
+		case (fsm_next_state)
+			FSM_STATE_LOAD_B_START:				b_addr <= bram_addr_zero;
+			FSM_STATE_LOAD_N_COEFF_START:		n_coeff_addr <= bram_addr_zero;
+			FSM_STATE_LOAD_N_START:				n_addr <= bram_addr_zero;
+			
+			FSM_STATE_LOAD_B_SHIFT:				b_addr <= b_addr_next;
+			FSM_STATE_LOAD_N_COEFF_SHIFT:		n_coeff_addr <= n_coeff_addr_next;
+			FSM_STATE_LOAD_N_SHIFT:				n_addr <= n_addr_next;
+		endcase
+		//
+		case (fsm_next_state)
+			FSM_STATE_MULT_A_B_START:	a_addr <= bram_addr_zero;
+			FSM_STATE_MULT_A_B_RELOAD:	a_addr <= !a_addr_done ? a_addr_next : a_addr;
+		endcase
+		//
 	end
 	
+	
 		//
-		// Systolic Array
+		// Internal Memories
 		//
-	wire	[31: 0]	mul_ab_p[SYSTOLIC_ARRAY_LENGTH-1:0];
-	wire	[31: 0]	mul_ab_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
 
-	wire	[31: 0]	mul_q_p[SYSTOLIC_ARRAY_LENGTH-1:0];
-	wire	[31: 0]	mul_q_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+		/* memory inputs */
+	reg	[31: 0]	ab_data_in;
+	reg	[31: 0]	q_data_in;
+	reg	[31: 0]	qn_data_in;
 
-	wire	[31: 0]	mul_qn_p[SYSTOLIC_ARRAY_LENGTH-1:0];
-	wire	[31: 0]	mul_qn_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
-	
-	wire	[31: 0]	mul_ab_a	= (mult_cnt_ab <= mult_cnt_half) ? a_bram_out : 32'd0;
-	reg	[31: 0]	mul_q_a_int;
-	reg	[31: 0]	mul_q_a;
-	reg	[31: 0]	mul_qn_a_int;
-	reg	[31: 0]	mul_qn_a;
-	
-	reg	[31: 0]	t_ab[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
-	reg	[31: 0]	c_ab_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+		/* memory outputs */
+	wire	[31: 0]	ab_data_out;
+	wire	[31: 0]	q_data_out;
+	wire	[31: 0]	qn_data_out;
 
-	reg	[31: 0]	t_q[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
-	reg	[31: 0]	c_q_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+		/* write enables */
+	reg	ab_wren;
+	reg	q_wren;
+	reg	qn_wren;
 
-	reg	[31: 0]	t_qn[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
-	reg	[31: 0]	c_qn_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
+	bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
 
-	genvar syst;
-	generate for (syst=0; syst<SYSTOLIC_ARRAY_LENGTH; syst=syst+1)
-		begin : gen_mul
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
 	
-			modexpa7_systolic_pe mul_ab_inst
-			(
-				.clk		(clk),
-				.a			(mul_ab_a),
-				.b			(b_buf[syst_cnt][syst]),
-				.t			(t_ab[syst_cnt][syst]),
-				.c_in		(c_ab_in[syst_cnt][syst]),
-				
-				.p			(mul_ab_p[syst]),
-				.c_out	(mul_ab_c_out[syst])
-			);
-			
-			modexpa7_systolic_pe mul_q_inst
-			(
-				.clk		(clk),
-				.a			(mul_q_a),
-				.b			(n_coeff_buf[syst_cnt][syst]),
-				.t			(t_q[syst_cnt][syst]),
-				.c_in		(c_q_in[syst_cnt][syst]),
-				
-				.p			(mul_q_p[syst]),
-				.c_out	(mul_q_c_out[syst])
-			);
-			
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
+	bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
 
-			modexpa7_systolic_pe mul_qn_inst
-			(
-				.clk		(clk),
-				.a			(mul_qn_a),
-				.b			(n_buf[syst_cnt][syst]),
-				.t			(t_qn[syst_cnt][syst]),
-				.c_in		(c_qn_in[syst_cnt][syst]),
-				
-				.p			(mul_qn_p[syst]),
-				.c_out	(mul_qn_c_out[syst])
-			);
-			
-		end
-	endgenerate
 	
 		//
-		// c_ab
-		//
-	always @(posedge clk)
+		// Wide Operand Loader
 		//
-		case (fsm_state)
-			
-			FSM_STATE_INIT_LAST_ADDR:
-				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						c_ab_in[i][j] <= 32'd0;
-						
-			FSM_STATE_PIPE_CRUNCH:
-				if (pe_latency_ab_lsb_done)
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						c_ab_in[syst_cnt_latency][j] <= mul_ab_c_out[j];
-		endcase
+	integer j;
 	
-		//
-		// c_q
-		//
-	always @(posedge clk)
-		//
-		case (fsm_state)
-			
-			FSM_STATE_INIT_LAST_ADDR:
-				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						c_q_in[i][j] <= 32'd0;
-						
-			FSM_STATE_PIPE_CRUNCH:
-				if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero))
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						c_q_in[syst_cnt_latency][j] <= mul_q_c_out[j];
-		endcase
-
-		//
-		// c_qn
-		//
+		/* shift logic */
 	always @(posedge clk)
 		//
 		case (fsm_state)
-			
-			FSM_STATE_INIT_LAST_ADDR:
-				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						c_qn_in[i][j] <= 32'd0;
-						
-			FSM_STATE_PIPE_CRUNCH:
-				if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero))
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						c_qn_in[syst_cnt_latency][j] <= mul_qn_c_out[j];
-		endcase
+			//
+			FSM_STATE_LOAD_B_SHIFT: begin
 		
-		//
-		// t_ab
-		//
-	always @(posedge clk)
-		//
-		case (fsm_state)
+						/* update the rightmost part of loader buffer */
+				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
+				
+						/* shift the loader buffer to the left */
+				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_din[j-1] <= loader_din[j];
+					
+			end
+			//
+			FSM_STATE_LOAD_N_COEFF_SHIFT: begin
 		
-			FSM_STATE_INIT_LAST_ADDR:
-				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						t_ab[i][j] <= 32'd0;
-						
-			FSM_STATE_PIPE_CRUNCH:
-				if (pe_latency_ab_lsb_done) begin
-					if (syst_cnt_latency > syst_cnt_zero)
-						t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0];
-					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						t_ab[syst_cnt_latency][j-1] <= mul_ab_p[j];
-				end
+						/* update the rightmost part of loader buffer */
+				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : {32{1'b0}};
+				
+						/* shift the loader buffer to the left */
+				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_din[j-1] <= loader_din[j];
+					
+			end
+			//
+			FSM_STATE_LOAD_N_SHIFT: begin
+		
+						/* update the rightmost part of loader buffer */
+				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
 				
+						/* shift the loader buffer to the left */
+				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_din[j-1] <= loader_din[j];
+					
+			end					
+			//
 		endcase
+		
 
-
-		//
-		// t_q
-		//
+		/* write enable logic */
 	always @(posedge clk)
 		//
-		case (fsm_state)
+		case (fsm_next_state)
 		
-			FSM_STATE_INIT_LAST_ADDR:
-				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						t_q[i][j] <= 32'd0;
-						
-			FSM_STATE_PIPE_CRUNCH:
-				if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero)) begin
-					if (syst_cnt_latency > syst_cnt_zero)
-						t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0];
-					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						t_q[syst_cnt_latency][j-1] <= mul_q_p[j];
-				end
-				
+			FSM_STATE_LOAD_B_WRITE,
+			FSM_STATE_LOAD_N_COEFF_WRITE,
+			FSM_STATE_LOAD_N_WRITE:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_wren[j] <= 1'b1;
+					
+			default:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_wren[j] <= 1'b0;
+					
 		endcase
 
-
-		//
-		// t_qn
-		//
-	always @(posedge clk)
+		/* loader address update logic */
+	always @(posedge clk) begin
 		//
 		case (fsm_state)
 		
-			FSM_STATE_INIT_LAST_ADDR:
-				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
-					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						t_qn[i][j] <= 32'd0;
-						
-			FSM_STATE_PIPE_CRUNCH:
-				if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero)) begin
-					if (syst_cnt_latency > syst_cnt_zero)
-						t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0];
-					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-						t_qn[syst_cnt_latency][j-1] <= mul_qn_p[j];
-				end
-				
+			FSM_STATE_LOAD_B_START,
+			FSM_STATE_LOAD_N_COEFF_START,
+			FSM_STATE_LOAD_N_START:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_lsb[j] <= syst_cnt_zero;
+					
+			FSM_STATE_LOAD_B_WRITE,
+			FSM_STATE_LOAD_N_COEFF_WRITE,
+			FSM_STATE_LOAD_N_WRITE:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
+					
+		endcase
+		//
+		case (fsm_next_state)
+			FSM_STATE_MULT_A_B_START,
+			FSM_STATE_MULT_AB_N_COEFF_START,
+			FSM_STATE_MULT_Q_N_START,
+			FSM_STATE_MULT_A_B_RELOAD,
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+			FSM_STATE_MULT_Q_N_RELOAD:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_lsb[j] <= syst_cnt_zero;
+													
+			FSM_STATE_MULT_A_B_CRUNCH,
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+			FSM_STATE_MULT_Q_N_CRUNCH:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_init;
 		endcase
+		//
+		case (fsm_next_state)
+		
+			FSM_STATE_LOAD_B_START,
+			FSM_STATE_MULT_A_B_START:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_msb[j] <= LOADER_ADDR_MSB_B;
+
+			FSM_STATE_LOAD_N_COEFF_START,
+			FSM_STATE_MULT_AB_N_COEFF_START:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
+					
+			FSM_STATE_LOAD_N_START,
+			FSM_STATE_MULT_Q_N_START:
+				//
+				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+					loader_addr_msb[j] <= LOADER_ADDR_MSB_N;
 
+		endcase
 		//
-		// Latency 2
+	end
+	
+	
 		//
-	always @(posedge clk)
+		// Systolic Array of Processing Elements
 		//
-		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
-			//
-			case (fsm_state)
-				FSM_STATE_INIT_LAST_ADDR,
-				FSM_STATE_PIPE_RELOAD:		pe_latency_ab_msb <= pe_latency_start;
-				FSM_STATE_PIPE_CRUNCH:		if (syst_cnt_done)
-					pe_latency_ab_msb <= pe_latency_ab_msb_done ?
-														pe_latency_ab_msb : pe_latency_ab_msb_next;
-			endcase
-
+	reg	[31: 0]	pe_a    [0:SYSTOLIC_ARRAY_LENGTH-1];
+	reg	[31: 0]	pe_b    [0:SYSTOLIC_ARRAY_LENGTH-1];
+	reg	[31: 0]	pe_t    [0:SYSTOLIC_ARRAY_LENGTH-1];
+	reg	[31: 0]	pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1];
+	wire	[31: 0]	pe_p    [0:SYSTOLIC_ARRAY_LENGTH-1];
+	wire	[31: 0]	pe_c_out[0:SYSTOLIC_ARRAY_LENGTH-1];
+	
 
 		//
-		// Adder
+		// These can be turned into a FIFO (maybe later?)...
 		//
-	reg				pe_add_ce;
-	reg	[31: 0]	pe_add_a0;
-	reg	[31: 0]	pe_add_a1;
-	reg	[31: 0]	pe_add_a2;
-	reg	[31: 0]	pe_add_b0;
+	reg	[31: 0]	pe_c_out_mem[0:SYSTOLIC_NUM_CYCLES-1][0:SYSTOLIC_ARRAY_LENGTH-1];	// [cycle][pe]: first index is a syst_cnt_* value, second is the PE number
+	reg	[31: 0]	pe_t_mem    [0:SYSTOLIC_NUM_CYCLES-1][0:SYSTOLIC_ARRAY_LENGTH-1];	// [cycle][pe]: same layout, so accesses stay in range when the two sizes differ
 
-	reg				pe_add_c_in;
-	wire	[31: 0]	pe_add_s;
-	wire				pe_add_c_out;
+	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)	// one PE instance per index i
+		begin : modexpa7_systolic_pe_multiplier		
+			modexpa7_systolic_pe systolic_pe_inst
+			(
+				.clk		(clk),
+				.a			(pe_a[i]),		// element i of each pe_* array goes to instance i
+				.b			(pe_b[i]),
+				.t			(pe_t[i]),
+				.c_in		(pe_c_in[i]),
+				.p			(pe_p[i]),		// pe_p / pe_c_out are wires driven only by this instance
+				.c_out	(pe_c_out[i])
+			);
+		end
+	endgenerate
 
-	reg				pe_sub_ce;
-	reg	[31: 0]	pe_sub_a0;
-	reg	[31: 0]	pe_sub_b0;
 
-	reg				pe_sub_b_in;
-	wire	[31: 0]	pe_sub_d;
-	wire				pe_sub_b_out;
-	
-	always @(posedge clk)
-		pe_add_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done;
-
-	always @(posedge clk)
-		pe_sub_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero);
+		
+			
+			//
+			// Shift Registers
+			//
+	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_load;		// bit 0 is set on START/RELOAD, then shifted left on every CRUNCH cycle
+	reg	[SYSTOLIC_PE_LATENCY  :0]	shreg_latency;	// same scheme, SYSTOLIC_PE_LATENCY+1 bits long
+	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_unload;	// cleared on START/RELOAD, shifted left with the top bit of shreg_latency fed in
 
-	always @(posedge clk)
-		//
-		if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done)
-			pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out;
+	wire	shreg_done_load = shreg_load[syst_cnt_last];	// tap at bit syst_cnt_last of the load register
+	wire	shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];	// tap at the top bit of the latency register
+	wire	shreg_done_unload = shreg_unload[syst_cnt_last];	// tap at bit syst_cnt_last; also ends the CRUNCH state in the FSM
 
-	always @(posedge clk)
-		//
-		if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero))
-			pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 1'b0 : pe_sub_b_out;
-	
+	reg										shreg_now_loading;	// set on START/RELOAD, cleared by shreg_done_load
+	reg										shreg_now_latency;	// set on START/RELOAD, cleared by shreg_done_latency
+	reg										shreg_now_unloading;	// set by shreg_done_latency, cleared by shreg_done_unload
 	
-	modexpa7_adder32 pe_add_inst
-	(
-		.clk		(clk),
-		.ce		(pe_add_ce),
-		.a			(pe_add_a2),
-		.b			(pe_add_b0),
-		.c_in		(pe_add_c_in),
-		.s			(pe_add_s),
-		.c_out	(pe_add_c_out)
-	);
-
-	modexpa7_subtractor32 pe_sub_inst
-	(
-		.clk		(clk),
-		.ce		(pe_sub_ce),
-		.a			(pe_sub_a0),
-		.b			(pe_sub_b0),
-		.b_in		(pe_sub_b_in),
-		.d			(pe_sub_d),
-		.b_out	(pe_sub_b_out)
-	);
+	reg										shreg_done_latency_dly;	// shreg_done_latency delayed by one clock
 	
 	always @(posedge clk)
-		//
-		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin
-			pe_add_a0 <= mul_ab_p[0];
-			pe_add_a1 <= pe_add_a0;
-			pe_add_a2 <= pe_add_a1;
-		end
+		shreg_done_latency_dly <= shreg_done_latency;	// used as the ab/q/qn write strobe source
 
 	always @(posedge clk)
 		//
-		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
-			pe_sub_a0 <= pe_add_s;
-
-	always @(posedge clk)
-		//
-		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
-			pe_add_b0 <= mul_qn_p[0];
-	
-	always @(posedge clk)
-		//
-		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
-			pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 32'd0 : n_bram_out;
-	
-	
-	always @(posedge clk)
-		//
-		case (fsm_next_state)
-			FSM_STATE_INIT_ZERO_ADDR:	n_bram_addr_reg <= bram_addr_zero;
-			FSM_STATE_INIT_NEXT_ADDR:	n_bram_addr_reg <= n_bram_addr_next;
-			FSM_STATE_PIPE_RELOAD: begin
-				if (mult_cnt_s == mult_cnt_half) n_bram_addr_reg <= bram_addr_zero;
-				if (mult_cnt_s > mult_cnt_half) n_bram_addr_reg <= n_bram_addr_next;
+		case (fsm_state)
+			// START/RELOAD of any pass: seed the load and latency registers at bit 0, clear the unload register
+			FSM_STATE_MULT_A_B_START,
+			FSM_STATE_MULT_AB_N_COEFF_START,
+			FSM_STATE_MULT_Q_N_START,
+			FSM_STATE_MULT_A_B_RELOAD,
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+			FSM_STATE_MULT_Q_N_RELOAD: begin
+				shreg_now_loading	<= 1'b1;
+				shreg_now_latency <= 1'b1;
+				shreg_now_unloading <= 1'b0;
+				shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
+				shreg_latency	<= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
+				shreg_unload	<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+			end
+			// CRUNCH of any pass: shift all three registers left; the top latency bit is shifted into the unload register
+			FSM_STATE_MULT_A_B_CRUNCH,
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+			FSM_STATE_MULT_Q_N_CRUNCH: begin
+				shreg_load		<= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
+				shreg_latency	<= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
+				shreg_unload	<= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
+				
+				if (shreg_done_load) shreg_now_loading <= 1'b0;
+				if (shreg_done_latency) shreg_now_latency <= 1'b0;
+				if (shreg_done_latency) shreg_now_unloading <= 1'b1;
+				else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
+				
+			end
+			// every other state: drop the three phase flags (the shift registers are not assigned and keep their value)
+			default: begin
+				shreg_now_loading <= 1'b0;
+				shreg_now_latency <= 1'b0;
+				shreg_now_unloading <= 1'b0;
 			end
+			//
 		endcase
 		
 		
+		
+		
+		
+	always @(posedge clk) begin	// updates ab_addr_ext, q_addr and qn_addr_ext
 		//
-		// Ready Flag Logic
-		//
-	reg rdy_reg = 1'b1;
-	assign rdy = rdy_reg;
-
-   always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0)	rdy_reg	<= 1'b1;
-		else begin
-			if (fsm_state == FSM_STATE_IDLE)		rdy_reg <= ~ena_trig;
-			if (fsm_state == FSM_STATE_STOP)	rdy_reg <= 1'b1;
-		end
-	
-
-		//
-		//
-		//
-	always @(posedge clk)
-		//
-		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
-			mul_q_a_int <= mul_ab_p[0];
-
-	always @(posedge clk)
+		case (fsm_state)	// each pass has its own counter: zeroed in *_START, stepped to *_next in *_RELOAD
+			FSM_STATE_MULT_A_B_START:				ab_addr_ext		<= bram_addr_ext_zero;
+			FSM_STATE_MULT_AB_N_COEFF_START:		q_addr			<= bram_addr_zero;
+			FSM_STATE_MULT_Q_N_START:				qn_addr_ext		<= bram_addr_ext_zero;
+			
+			FSM_STATE_MULT_A_B_RELOAD:				ab_addr_ext		<= ab_addr_ext_next;
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	q_addr			<= q_addr_next;
+			FSM_STATE_MULT_Q_N_RELOAD:				qn_addr_ext		<= qn_addr_ext_next;
+			
+		endcase
 		//
-		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
-			mul_qn_a_int <= mul_q_p[0];
-
-	always @(posedge clk)
+		case (fsm_next_state)	// ab_addr_ext is also stepped during the AB*N_COEFF pass, keyed on the next state
+			FSM_STATE_MULT_AB_N_COEFF_START:		ab_addr_ext <= bram_addr_ext_zero;
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	ab_addr_ext <= ab_addr_ext_next;
+		endcase
 		//
-		if (fsm_state == FSM_STATE_PIPE_RELOAD)
-			mul_q_a <= mul_q_a_int;	// TODO: Add masking! Maybe not needed after all?..
+		case (fsm_next_state)	// q_addr is also stepped during the Q*N pass, holding once q_addr_done is set
+			FSM_STATE_MULT_Q_N_START:		q_addr <= bram_addr_zero;
+			FSM_STATE_MULT_Q_N_RELOAD:		q_addr <= !q_addr_done ? q_addr_next : q_addr;
+		endcase
 
-	always @(posedge clk)
-		//
-		if (fsm_state == FSM_STATE_PIPE_RELOAD)
-			mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? mul_qn_a_int : 32'd0;
-	
-		//
-		// Debug
-		//
-	//always @(posedge clk) begin
-		//
-		//if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
-			//$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]);
-		//
-		//if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
-			//$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]);
-		//
-		//if (fsm_state == FSM_STATE_PIPE_RELOAD)
-			//$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s);
 		//
-		//if (fsm_state == FSM_STATE_PIPE_RELOAD)
-			//$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d);
-		//
-	//end
-		
+	end
 		
-	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_rd;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_wr;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_wr_next = s_bram_addr_wr + 1'b1;
-	reg										s_bram_en;
-	
-	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_rd;
-	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_wr;
-	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1;
-	reg										sn_bram_en;
-	
-	assign s_bram_addr_rd = s_bram_addr;
-	assign sn_bram_addr_rd = s_bram_addr;
-	
-	wire	[31: 0]	s_bram_din;
-	wire	[31: 0]	s_bram_dout;
-	
-	wire	[31: 0]	sn_bram_din;
-	wire	[31: 0]	sn_bram_dout;
-	
-	assign s_bram_din = pe_add_s;
-	assign sn_bram_din = pe_sub_d;
-	
-	always @(posedge clk)
+	always @(posedge clk) begin	// registers the write strobe and write data for the ab, q and qn targets
 		//
-		s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half);
-
-	always @(posedge clk)
+		if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin	// A*B pass: strobe follows shreg_done_latency_dly, data comes from pe_p[0]
+			ab_wren <= shreg_done_latency_dly;
+			ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+		end else begin
+			ab_wren <= 1'b0;
+			ab_data_in <= 32'hXXXXXXXX;
+		end
 		//
-		sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half);
-	
-	always @(posedge clk) begin
+		if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin	// AB*N_COEFF pass: same scheme for q
+			q_wren <= shreg_done_latency_dly;
+			q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+		end else begin
+			q_wren <= 1'b0;
+			q_data_in <= 32'hXXXXXXXX;
+		end
 		//
-		if (pe_add_ce && (mult_cnt_qn == mult_cnt_half)) s_bram_addr_wr <= bram_addr_zero;
-		if (s_bram_en && (s_bram_addr_wr < bram_addr_last)) s_bram_addr_wr <= s_bram_addr_wr_next;
-	end
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin	// Q*N pass: same scheme for qn
+			qn_wren <= shreg_done_latency_dly;
+			qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+		end else begin
+			qn_wren <= 1'b0;
+			qn_data_in <= 32'hXXXXXXXX;
+		end		
 
-	always @(posedge clk) begin
 		//
-		if (pe_sub_ce && (mult_cnt_s == mult_cnt_half)) sn_bram_addr_wr <= bram_addr_zero;
-		if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last)) sn_bram_addr_wr <= sn_bram_addr_wr_next;
 	end
 	
-	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
-	bram_s (.clk(clk),
-		.a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(),
-		.b_addr(s_bram_addr_rd), .b_out(s_bram_dout));
-
-	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
-	bram_sn (.clk(clk),
-		.a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(),
-		.b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout));
-		
-		
-	reg	r_bram_en;
 	
 	always @(posedge clk)
 		//
-		case (fsm_state)
-			FSM_STATE_SAVE_ZERO_ADDR,
-			FSM_STATE_SAVE_NEXT_ADDR:	r_bram_en <= 1'b1;
-			default:							r_bram_en <= 1'b0;
+		case (fsm_next_state)
+			FSM_STATE_MULT_A_B_START,
+			FSM_STATE_MULT_AB_N_COEFF_START,
+			FSM_STATE_MULT_Q_N_START,
+			FSM_STATE_MULT_A_B_RELOAD,
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+			FSM_STATE_MULT_Q_N_RELOAD:
+				// about to enter START or RELOAD: restart the load counter from zero
+				syst_cnt_load <= syst_cnt_zero;
 			
+			FSM_STATE_MULT_A_B_CRUNCH,
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+			FSM_STATE_MULT_Q_N_CRUNCH:
+				// about to be in CRUNCH: step to syst_cnt_load_next, hold once syst_cnt_load_done is set
+				syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+				
 		endcase
+
 		
 		
+	always @(posedge clk)
+		// unload counter: zeroed when shreg_done_latency fires, then stepped (holding once done) while shreg_now_unloading
+		case (fsm_state)
+			FSM_STATE_MULT_A_B_CRUNCH,
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+			FSM_STATE_MULT_Q_N_CRUNCH: begin
 		
-	reg	r_bram_wr_reg;
-	
-	assign r_bram_wr = r_bram_wr_reg;
+			if (shreg_done_latency)	syst_cnt_unload <= syst_cnt_zero;
+			else if (shreg_now_unloading)
+				syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
+
+			end
+		endcase
 	
 	always @(posedge clk)
 		//
-		r_bram_wr_reg <= r_bram_en;
-		
-		
-	wire r_select_s_over_sn = pe_sub_b_out && !pe_add_c_out;
-		
+		case (fsm_state)
+			FSM_STATE_MULT_A_B_CRUNCH,
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+			FSM_STATE_MULT_Q_N_CRUNCH: begin
 		
-	reg	[31: 0]	r_bram_in_reg;
-	
-	assign r_bram_in = r_bram_in_reg;
+				if (shreg_now_unloading)	// store every PE's c_out in slot syst_cnt_unload
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						pe_c_out_mem[syst_cnt_unload][j] <= pe_c_out[j];
+						
+				if (shreg_now_unloading) begin
+				
+					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)	// pe_p of PE j is stored one lane down, in lane j-1
+						pe_t_mem[syst_cnt_unload][j-1] <= pe_p[j];
+						
+					if (syst_cnt_unload > syst_cnt_zero)	// pe_p[0] goes to the last lane of the previous slot
+						pe_t_mem[syst_cnt_unload-1'b1][SYSTOLIC_ARRAY_LENGTH-1] <= pe_p[0];
+					else	// slot zero has no previous slot: zero the last lane of slot syst_cnt_last instead
+						pe_t_mem[syst_cnt_last][SYSTOLIC_ARRAY_LENGTH-1] <= 32'd0;
+					
+				end
+			end
+		endcase
 
-		always @(posedge clk)
+		
 			//
-			if (r_bram_en)
-				r_bram_in_reg <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout;
-	
-	always @(posedge clk)
+			// T and C_IN can be moved to a separate code block
+			//
+	always @(posedge clk) begin
+		// A*B pass: a <- a_bram_out (zero when ab_addr_ext > a_addr), b <- loader_dout[j]
+		if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH)
+			//
+			for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+				// t and c_in are zero while a_addr is zero, otherwise read back from the feedback memories
+				if (shreg_now_loading) begin
+					pe_a[j]		<= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out;
+					pe_b[j]		<= loader_dout[j];
+					pe_t[j]		<= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
+					pe_c_in[j]	<= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
+				end else begin
+					pe_a[j]		<= 32'hXXXXXXXX;				
+					pe_b[j]		<= 32'hXXXXXXXX;
+					pe_t[j]		<= 32'hXXXXXXXX;
+					pe_c_in[j]	<= 32'hXXXXXXXX;
+				end
+		// AB*N_COEFF pass: a <- ab_data_out, b <- loader_dout[j]
+		if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)
+			//
+			for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+				// t and c_in are zero while ab_addr_ext is zero, otherwise read back from the feedback memories
+				if (shreg_now_loading) begin
+					pe_a[j]		<= ab_data_out;
+					pe_b[j]		<= loader_dout[j];
+					pe_t[j]		<= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
+					pe_c_in[j]	<= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
+				end else begin
+					pe_a[j]		<= 32'hXXXXXXXX;				
+					pe_b[j]		<= 32'hXXXXXXXX;
+					pe_t[j]		<= 32'hXXXXXXXX;
+					pe_c_in[j]	<= 32'hXXXXXXXX;
+				end
+		// Q*N pass: a <- q_data_out (zero when qn_addr_ext > q_addr), b <- loader_dout[j]
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			//
+			for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+				// t and c_in are zero while q_addr is zero, otherwise read back from the feedback memories
+				if (shreg_now_loading) begin
+					pe_a[j]		<= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
+					pe_b[j]		<= loader_dout[j];
+					pe_t[j]		<= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
+					pe_c_in[j]	<= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
+				end else begin
+					pe_a[j]		<= 32'hXXXXXXXX;				
+					pe_b[j]		<= 32'hXXXXXXXX;
+					pe_t[j]		<= 32'hXXXXXXXX;
+					pe_c_in[j]	<= 32'hXXXXXXXX;
+				end
 		//
-		if (r_bram_en)
-			r_bram_addr_reg <= s_bram_addr_dly;
-	
 	
 		//
-		// FSM Transition Logic
+	end
+		
+			
+		//
+		// FSM Process
 		//
 	always @(posedge clk or negedge rst_n)
 		//
 		if (rst_n == 1'b0)	fsm_state <= FSM_STATE_IDLE;
 		else						fsm_state <= fsm_next_state;
 	
+	
+		//
+		// FSM Transition Logic
+		//
 	always @* begin
 		//
 		fsm_next_state = FSM_STATE_STOP;
 		//
 		case (fsm_state)
-		
-			FSM_STATE_IDLE:				if (ena_trig)				fsm_next_state = FSM_STATE_INIT_ZERO_ADDR;
+
+			FSM_STATE_IDLE:				if (ena_trig)				fsm_next_state = FSM_STATE_LOAD_B_START;
 												else							fsm_next_state = FSM_STATE_IDLE;
-												
-			FSM_STATE_INIT_ZERO_ADDR:									fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
-			
-			FSM_STATE_INIT_NEXT_ADDR:	if (b_bram_addr_done)	fsm_next_state = FSM_STATE_INIT_LAST_ADDR;
-												else							fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
-												
-			FSM_STATE_INIT_LAST_ADDR:									fsm_next_state = FSM_STATE_PIPE_CRUNCH;
-			
-			FSM_STATE_PIPE_CRUNCH:		if (syst_cnt_done)		fsm_next_state = pe_latency_ab_msb_done ?
-																					FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH;
-												else							fsm_next_state = FSM_STATE_PIPE_CRUNCH;
-
-			FSM_STATE_PIPE_RELOAD:		if (mult_cnt_s_done)		fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR;
-												else							fsm_next_state = FSM_STATE_PIPE_CRUNCH;
-												
-			FSM_STATE_SAVE_ZERO_ADDR:									fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
-			
-			FSM_STATE_SAVE_NEXT_ADDR:	if (s_bram_addr_done)	fsm_next_state = FSM_STATE_SAVE_LAST_ADDR;
-												else							fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
-			
-			FSM_STATE_SAVE_LAST_ADDR:									fsm_next_state = FSM_STATE_STOP;
-			
+			// LOAD_B: SHIFT until mult_cnt_done, then WRITE; WRITE loops back to SHIFT until syst_cnt_init_done
+			FSM_STATE_LOAD_B_START:											fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+			FSM_STATE_LOAD_B_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_B_WRITE;
+												else								fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+			FSM_STATE_LOAD_B_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_B_FINAL;
+												else							fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+			FSM_STATE_LOAD_B_FINAL:										fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
+			// LOAD_N_COEFF: same SHIFT/WRITE loop as LOAD_B
+			FSM_STATE_LOAD_N_COEFF_START:											fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+			FSM_STATE_LOAD_N_COEFF_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
+												else								fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+			FSM_STATE_LOAD_N_COEFF_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
+												else							fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+			FSM_STATE_LOAD_N_COEFF_FINAL:										fsm_next_state = FSM_STATE_LOAD_N_START;
+			// LOAD_N: same SHIFT/WRITE loop as LOAD_B
+			FSM_STATE_LOAD_N_START:											fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+			FSM_STATE_LOAD_N_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_N_WRITE;
+												else								fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+			FSM_STATE_LOAD_N_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_N_FINAL;
+												else							fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+			FSM_STATE_LOAD_N_FINAL:										fsm_next_state = FSM_STATE_MULT_A_B_START;
+			// MULT_A_B: CRUNCH until shreg_done_unload, then RELOAD; RELOAD loops back to CRUNCH until ab_addr_ext_done
+			FSM_STATE_MULT_A_B_START:									fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+			FSM_STATE_MULT_A_B_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
+												else							fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+			FSM_STATE_MULT_A_B_RELOAD:	if (ab_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
+												else							fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+			FSM_STATE_MULT_A_B_FINAL:									fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
+			// MULT_AB_N_COEFF: same CRUNCH/RELOAD loop, terminated by q_addr_done
+			FSM_STATE_MULT_AB_N_COEFF_START:									fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+			FSM_STATE_MULT_AB_N_COEFF_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
+															else							fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	if (q_addr_done)	fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
+															else							fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+			FSM_STATE_MULT_AB_N_COEFF_FINAL:									fsm_next_state = FSM_STATE_MULT_Q_N_START;
+			// MULT_Q_N: same CRUNCH/RELOAD loop, terminated by qn_addr_ext_done
+			FSM_STATE_MULT_Q_N_START:									fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+			FSM_STATE_MULT_Q_N_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
+															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+			FSM_STATE_MULT_Q_N_RELOAD:	if (qn_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
+															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+			FSM_STATE_MULT_Q_N_FINAL:									fsm_next_state = FSM_STATE_STOP;
+			// STOP returns to IDLE; any state not listed here falls through to the STOP default set above
 			FSM_STATE_STOP:												fsm_next_state = FSM_STATE_IDLE;
-			
+
 		endcase
-	end
+		//
+	end
 
 
 endmodule
diff --git a/src/rtl/pe/modexpa7_adder32.v b/src/rtl/pe/modexpa7_adder32.v
index ad296b1..04f8a18 100644
--- a/src/rtl/pe/modexpa7_adder32.v
+++ b/src/rtl/pe/modexpa7_adder32.v
@@ -51,7 +51,7 @@ module modexpa7_adder32
 		//
 		// Include Primitive Selector
 		//
-	`include "modexpa7_lowlevel_settings.v"
+	`include "modexpa7_primitive_switch.v"
 
 
 		//
diff --git a/src/rtl/pe/modexpa7_lowlevel_settings.v b/src/rtl/pe/modexpa7_primitive_switch.v
similarity index 95%
rename from src/rtl/pe/modexpa7_lowlevel_settings.v
rename to src/rtl/pe/modexpa7_primitive_switch.v
index 93f5f34..d38069b 100644
--- a/src/rtl/pe/modexpa7_lowlevel_settings.v
+++ b/src/rtl/pe/modexpa7_primitive_switch.v
@@ -12,4 +12,5 @@
 `define SUBTRACTOR32_PRIMITIVE	subtractor32_generic
 `define SYSTOLIC_PE_PRIMITIVE	systolic_pe_generic
 
+
 `endif
diff --git a/src/rtl/pe/modexpa7_subtractor32.v b/src/rtl/pe/modexpa7_subtractor32.v
index 75b9c13..a43d670 100644
--- a/src/rtl/pe/modexpa7_subtractor32.v
+++ b/src/rtl/pe/modexpa7_subtractor32.v
@@ -51,7 +51,7 @@ module modexpa7_subtractor32
 		//
 		// Include Primitive Selector
 		//
-	`include "modexpa7_lowlevel_settings.v"
+	`include "modexpa7_primitive_switch.v"
 
 
 		//
diff --git a/src/rtl/pe/modexpa7_systolic_pe.v b/src/rtl/pe/modexpa7_systolic_pe.v
index 22e6874..b284134 100644
--- a/src/rtl/pe/modexpa7_systolic_pe.v
+++ b/src/rtl/pe/modexpa7_systolic_pe.v
@@ -51,7 +51,7 @@ module modexpa7_systolic_pe
 		//
 		// Include Primitive Selector
 		//
-	`include "modexpa7_lowlevel_settings.v"
+	`include "modexpa7_primitive_switch.v"
 
 
 		//
diff --git a/src/tb/tb_systolic_multiplier.v b/src/tb/tb_systolic_multiplier.v
index 21e319a..9df492e 100644
--- a/src/tb/tb_systolic_multiplier.v
+++ b/src/tb/tb_systolic_multiplier.v
@@ -176,7 +176,7 @@ module tb_systolic_multiplier;
 		.r_bram_in				(core_r_data), 
 		.r_bram_wr				(core_r_wren), 
 		
-		.n_num_words			(n_num_words)
+		.ab_num_words			(n_num_words)
 	);
 
 
@@ -273,6 +273,7 @@ module tb_systolic_multiplier;
 
 				b = ab_modulo;										// prepare for next round
 
+				#1000000;
 			end		
 		
 				// final step, display results



More information about the Commits mailing list