[Cryptech-Commits] [core/math/modexpa7] branch systolic updated: Systolic multiplier simplified a bit: * passes testbench tests again * this time synthesizes fine (without major issues)

git at cryptech.is git at cryptech.is
Thu Jul 13 18:47:33 UTC 2017


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch systolic
in repository core/math/modexpa7.

The following commit(s) were added to refs/heads/systolic by this push:
     new 72a67f0  Systolic multiplier simplified a bit:  * passes testbench tests again  * this time synthesizes fine (without major issues)
72a67f0 is described below

commit 72a67f04a21ba4006c7b5bf38e01a3aa6592740f
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Thu Jul 13 21:38:53 2017 +0300

    Systolic multiplier simplified a bit:
     * passes testbench tests again
     * this time synthesizes fine (without major issues)
    
    List of things that need polishing in the future:
     * Parallelized operand loader can be reduced by a factor of 3
       to only store one operand at a time: it currently stores
       B, N_COEFF and N. After B is consumed, it can be overwritten
       with AB, and N_COEFF can be loaded sequentially the same way
       A is loaded. After that, the loader can be filled with Q while
       N is loaded sequentially.
     * Turns out QN block memory is not needed at all. After we obtain
       the next word of QN, we immediately calculate SN. After that QN
       can be discarded, no need to store it.
     * Currently there are two wide memories, T and PE_C_OUT. XST throws
       weird warnings about multi-port RAM before finally deciding
       to implement them using flip-flops. Those memories should be turned
       into FIFOs to simplify the design and not confuse XST.
---
 src/rtl/modexpa7_systolic_multiplier.v | 239 ++++++++++++++++++++++++++++++++-
 src/tb/tb_systolic_multiplier.v        |   1 -
 2 files changed, 234 insertions(+), 6 deletions(-)

diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
index 56e7be3..513b5aa 100644
--- a/src/rtl/modexpa7_systolic_multiplier.v
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -118,6 +118,10 @@ module modexpa7_systolic_multiplier #
 	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_RELOAD				= 8'h63;
 	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_FINAL				= 8'h64;
 	
+	localparam	[ 7: 0]	FSM_STATE_SAVE_START						= 8'h71;
+	localparam	[ 7: 0]	FSM_STATE_SAVE_WRITE						= 8'h72;
+	localparam	[ 7: 0]	FSM_STATE_SAVE_FINAL						= 8'h73;	
+	
 	localparam	[ 7: 0]	FSM_STATE_STOP								= 8'hFF;
 	
 		//
@@ -271,7 +275,7 @@ module modexpa7_systolic_multiplier #
 		
 		//
 		// Loader currently stores B, N_COEFF and N, it can be coded another way
-		// to initially stire B, then AB, then Q. Some memory can be saved thay way.
+		// to initially store B, then AB, then Q. Some memory can be saved thay way.
 		// Maybe later...
 		//
 		
@@ -324,6 +328,9 @@ module modexpa7_systolic_multiplier #
 	reg	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext;
 	reg	[OPERAND_ADDR_WIDTH-1:0]	q_addr;
 	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	s_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_addr;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr;
 		
 		/* handy increment values */
 	wire	[OPERAND_ADDR_WIDTH-1:0]	a_addr_next			= a_addr       + 1'b1;
@@ -333,6 +340,9 @@ module modexpa7_systolic_multiplier #
 	wire	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_next	= ab_addr_ext  + 1'b1;
 	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_next			= q_addr       + 1'b1;
 	wire	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_next	= qn_addr_ext  + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_addr_next	= s_addr  + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_addr_next	= sn_addr  + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	r_addr_next	= r_addr  + 1'b1;
 	
 		/* handy stop flags */
 	wire	a_addr_done			= (a_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
@@ -342,6 +352,9 @@ module modexpa7_systolic_multiplier #
 	wire	ab_addr_ext_done	= (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
 	wire	q_addr_done			= (q_addr      == bram_addr_last)     ? 1'b1 : 1'b0;
 	wire	qn_addr_ext_done	= (qn_addr_ext     == bram_addr_ext_last)     ? 1'b1 : 1'b0;
+	wire	s_addr_done	= (s_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	sn_addr_done	= (sn_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
+	wire	r_addr_done	= (r_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
 
 		/* delayed B address */
 	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_dly;
@@ -358,9 +371,16 @@ module modexpa7_systolic_multiplier #
 	assign b_bram_addr = b_addr;
 	assign n_coeff_bram_addr = n_coeff_addr;
 	assign n_bram_addr = n_addr;
+	assign r_bram_addr = r_addr;
 
 
 		//
+		// Flag
+		//
+	reg	flag_select_s;
+	
+	
+		//
 		// Memory Address Control Logic
 		//
 	always @(posedge clk) begin
@@ -375,6 +395,20 @@ module modexpa7_systolic_multiplier #
 			FSM_STATE_LOAD_N_SHIFT:				n_addr <= n_addr_next;
 		endcase
 		//
+		case (fsm_state)
+			FSM_STATE_MULT_Q_N_RELOAD: 
+				if (qn_addr_ext == {1'b0, bram_addr_last})
+					n_addr		<= bram_addr_zero;
+				else if (qn_addr_ext > {1'b0, bram_addr_last})
+					n_addr		<= n_addr_next;
+			
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_SAVE_START:	r_addr <= bram_addr_zero;
+			FSM_STATE_SAVE_WRITE:	r_addr <= r_addr_next;
+		endcase
+		//
 		case (fsm_next_state)
 			FSM_STATE_MULT_A_B_START:	a_addr <= bram_addr_zero;
 			FSM_STATE_MULT_A_B_RELOAD:	a_addr <= !a_addr_done ? a_addr_next : a_addr;
@@ -391,16 +425,28 @@ module modexpa7_systolic_multiplier #
 	reg	[31: 0]	ab_data_in;
 	reg	[31: 0]	q_data_in;
 	reg	[31: 0]	qn_data_in;
+	wire	[31: 0]	s_data_in;
+	wire	[31: 0]	sn_data_in;
+	reg	[31: 0]	r_data_in;
 
 		/* memory outputs */
 	wire	[31: 0]	ab_data_out;
 	wire	[31: 0]	q_data_out;
 	wire	[31: 0]	qn_data_out;
+	wire	[31: 0]	s_data_out;
+	wire	[31: 0]	sn_data_out;
 
 		/* write enables */
 	reg	ab_wren;
 	reg	q_wren;
 	reg	qn_wren;
+	reg	s_wren;
+	reg	sn_wren;
+	reg	r_wren;
+	
+		/* map */
+	assign r_bram_in = r_data_in;
+	assign r_bram_wr = r_wren;
 
 	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
 	bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
@@ -411,6 +457,12 @@ module modexpa7_systolic_multiplier #
 	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
 	bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
 
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));
+
+	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));
+
 	
 		//
 		// Wide Operand Loader
@@ -646,13 +698,46 @@ module modexpa7_systolic_multiplier #
 		case (fsm_state)
 			FSM_STATE_MULT_A_B_START:				ab_addr_ext		<= bram_addr_ext_zero;
 			FSM_STATE_MULT_AB_N_COEFF_START:		q_addr			<= bram_addr_zero;
-			FSM_STATE_MULT_Q_N_START:				qn_addr_ext		<= bram_addr_ext_zero;
+			FSM_STATE_MULT_Q_N_START: begin		qn_addr_ext		<= bram_addr_ext_zero;
+															ab_addr_ext		<= bram_addr_ext_zero;															
+															end
 			
 			FSM_STATE_MULT_A_B_RELOAD:				ab_addr_ext		<= ab_addr_ext_next;
 			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	q_addr			<= q_addr_next;
-			FSM_STATE_MULT_Q_N_RELOAD:				qn_addr_ext		<= qn_addr_ext_next;
+			FSM_STATE_MULT_Q_N_RELOAD: begin		qn_addr_ext		<= qn_addr_ext_next;
+															ab_addr_ext		<= ab_addr_ext_next;
+															end
+		endcase
+		//
+		case (fsm_state)
+		
+			FSM_STATE_MULT_Q_N_RELOAD: begin
+				if (qn_addr_ext == {1'b0, bram_addr_last}) begin
+					s_addr		<= bram_addr_zero;
+					sn_addr	<= bram_addr_zero;
+				end
+				
+				if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
+					s_addr <= s_addr_next;
+					sn_addr <= sn_addr_next;
+				end
+
+				if (qn_addr_ext == bram_addr_ext_last) begin
+					s_addr <= bram_addr_zero;
+					sn_addr <= bram_addr_zero;
+				end
+			
+			end
+			
+			FSM_STATE_MULT_Q_N_FINAL,
+			FSM_STATE_SAVE_START,
+			FSM_STATE_SAVE_WRITE: begin
+				s_addr <= !s_addr_done ? s_addr_next : s_addr;
+				sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
+			end
 			
 		endcase
+		
 		//
 		case (fsm_next_state)
 			FSM_STATE_MULT_AB_N_COEFF_START:		ab_addr_ext <= bram_addr_ext_zero;
@@ -692,7 +777,12 @@ module modexpa7_systolic_multiplier #
 			qn_wren <= 1'b0;
 			qn_data_in <= 32'hXXXXXXXX;
 		end		
-
+		//
+		case (fsm_state)
+			FSM_STATE_SAVE_START:	r_wren <= 1'b1;
+			FSM_STATE_SAVE_WRITE:	r_wren <= ~r_addr_done;
+			default:						r_wren <= 1'b0;
+		endcase
 		//
 	end
 	
@@ -816,6 +906,140 @@ module modexpa7_systolic_multiplier #
 		//
 	end
 		
+		
+		//
+		// Adder
+		//
+		/*
+		 * This adder is used to calculate S = AB + QN.
+		 *
+		 */
+	reg				add1_ce;					// clock enable
+	reg	[31: 0]	add1_s;					// sum output
+	wire				add1_c_in;				// carry input
+	wire	[31: 0]	add1_a;					// A-input
+	reg	[31: 0]	add1_b;					// B-input
+	reg				add1_c_in_mask;		// flag to not carry anything into the very first word
+	reg				add1_c_out;				// carry output
+	
+		/* add masking into carry feedback chain */
+	assign add1_c_in = add1_c_out & ~add1_c_in_mask;
+
+		/* mask carry for the very first word of N */
+	//always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+	
+	always @(posedge  clk)
+		//
+		if (add1_ce)
+			//
+			{add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in};
+	
+	assign add1_a = qn_data_in;
+	
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			add1_b <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX;
+		else
+			add1_b <= 32'hXXXXXXXX;
+
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			add1_c_in_mask <= (shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero)) ? 1'b1 : 1'b0;
+		else
+			add1_c_in_mask <= 1'b0;
+
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			add1_ce <= shreg_done_latency_dly;
+		else
+			add1_ce <= 1'b0;
+
+
+	assign s_data_in = add1_s;
+	assign sn_data_in = sub1_d;
+	
+	always @(posedge clk) begin
+		//
+		s_wren <= add1_ce;
+		sn_wren <= sub1_ce;
+	end
+		
+		
+		
+		//
+		// Subtractor
+		//
+		/*
+		 * This subtractor is used to calculate SN = S - N.
+		 *
+		 */
+	reg				sub1_ce;					// clock enable
+	reg	[31: 0]	sub1_d;					// difference output
+	wire				sub1_b_in;				// borrow input
+	wire	[31: 0]	sub1_a;					// A-input
+	reg	[31: 0]	sub1_b;					// B-input
+	reg				sub1_b_in_mask;		// flag to not borrow anything from the very first word
+	reg				sub1_b_out;				// borrow output
+	
+		/* add masking into borrow feedback chain */
+	assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
+	
+	always @(posedge  clk)
+		//
+		if (sub1_ce)
+			//
+			{sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};
+	
+	assign sub1_a = add1_s;
+	
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
+		else
+			sub1_b <= 32'hXXXXXXXX;
+
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
+		else
+			sub1_b_in_mask <= 1'b0;
+
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+			sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
+		else
+			sub1_ce <= 1'b0;
+
+
+	assign s_data_in = add1_s;
+	
+	always @(posedge clk)
+		//
+		s_wren <= add1_ce;
+		
+		
+
+	always @(posedge clk)
+		//
+		if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
+			flag_select_s <= sub1_b_out & ~add1_c_out;
+		
+
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			FSM_STATE_SAVE_START,
+			FSM_STATE_SAVE_WRITE:
+				r_data_in <= flag_select_s ? s_data_out : sn_data_out;
+		endcase
+
+		
 			
 		//
 		// FSM Process
@@ -878,7 +1102,12 @@ module modexpa7_systolic_multiplier #
 															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
 			FSM_STATE_MULT_Q_N_RELOAD:	if (qn_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
 															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
-			FSM_STATE_MULT_Q_N_FINAL:									fsm_next_state = FSM_STATE_STOP;
+			FSM_STATE_MULT_Q_N_FINAL:									fsm_next_state = FSM_STATE_SAVE_START;
+			//
+			FSM_STATE_SAVE_START:										fsm_next_state = FSM_STATE_SAVE_WRITE;
+			FSM_STATE_SAVE_WRITE:	if (r_addr_done)				fsm_next_state = FSM_STATE_SAVE_FINAL;
+											else								fsm_next_state = FSM_STATE_SAVE_WRITE;
+			FSM_STATE_SAVE_FINAL:										fsm_next_state = FSM_STATE_STOP;
 			//
 			FSM_STATE_STOP:												fsm_next_state = FSM_STATE_IDLE;
 
diff --git a/src/tb/tb_systolic_multiplier.v b/src/tb/tb_systolic_multiplier.v
index 9df492e..33d1e01 100644
--- a/src/tb/tb_systolic_multiplier.v
+++ b/src/tb/tb_systolic_multiplier.v
@@ -273,7 +273,6 @@ module tb_systolic_multiplier;
 
 				b = ab_modulo;										// prepare for next round
 
-				#1000000;
 			end		
 		
 				// final step, display results

-- 
To stop receiving notification emails like this one, please contact
the administrator of this repository.


More information about the Commits mailing list