[Cryptech-Commits] [core/math/modexpa7] 02/02: Wrote top-level module. 4096-bit core with 16-tap systolic array synthesizes just fine: 10% slices 8% block memory 33% DSPs

git at cryptech.is git at cryptech.is
Sun Jul 23 21:13:14 UTC 2017


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch systolic
in repository core/math/modexpa7.

commit b33f595c014250072e9d787057293ef685eab5f3
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Sun Jul 23 04:10:47 2017 +0300

    Wrote top-level module. 4096-bit core with 16-tap systolic array synthesizes just fine:
     10% slices
      8% block memory
     33% DSPs
---
 src/rtl/modexpa7_top.v                 | 481 +++++++++++++++++++++++++++++++++
 src/rtl/pe/modexpa7_primitive_switch.v |   2 +-
 2 files changed, 482 insertions(+), 1 deletion(-)

diff --git a/src/rtl/modexpa7_top.v b/src/rtl/modexpa7_top.v
new file mode 100644
index 0000000..0c4eabe
--- /dev/null
+++ b/src/rtl/modexpa7_top.v
@@ -0,0 +1,481 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module modexpa7_top #
+	(
+		parameter OPERAND_ADDR_WIDTH		= 7,
+		parameter SYSTOLIC_ARRAY_POWER	= 4
+	)
+	(
+		input											clk,
+		input											rst_n,
+		
+		input											init,
+		output										ready,
+
+		input											next,
+		output										valid,
+
+		input		[OPERAND_ADDR_WIDTH-1:0]	modulus_num_words,
+		input		[OPERAND_ADDR_WIDTH+4:0]	exponent_num_bits,
+
+		input 										bus_cs,
+		input 										bus_we,
+		input		[OPERAND_ADDR_WIDTH+1:0]	bus_addr,
+		input		[                32-1:0]	bus_data_wr,
+		output 	[                32-1:0]	bus_data_rd
+	);
+	
+	
+		/*
+		 * FSM Declaration
+		 */
+		 
+	localparam	[ 2: 0]	FSM_STATE_IDLE					= 3'd0;
+	
+	localparam	[ 2: 0]	FSM_STATE_PRECALC_START		= 3'd1;
+	localparam	[ 2: 0]	FSM_STATE_PRECALC_CRUNCH	= 3'd2;
+	localparam	[ 2: 0]	FSM_STATE_PRECALC_FINAL		= 3'd3;
+		
+	localparam	[ 2: 0]	FSM_STATE_EXPONENT_START	= 3'd4;
+	localparam	[ 2: 0]	FSM_STATE_EXPONENT_CRUNCH	= 3'd5;
+	localparam	[ 2: 0]	FSM_STATE_EXPONENT_FINAL	= 3'd6;
+	
+	localparam	[ 7: 0]	FSM_STATE_STOP					= 3'd7;
+	
+	
+		/*
+		 * FSM State / Next State
+		 */
+		 
+	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
+	reg	[ 7: 0]	fsm_next_state;
+
+
+		/*
+		 * Enable Delay (Trigger)
+		 */
+		 
+   reg init_dly = 1'b0;
+   reg next_dly = 1'b0;
+
+		// delay init and next by one clock cycle
+   always @(posedge clk) init_dly <= init;
+	always @(posedge clk) next_dly <= next;
+
+		// trigger new operation when one of the control inputs goes from low to high
+   wire init_trig = init && !init_dly;
+   wire next_trig = next && !next_dly;
+	
+	
+		/*
+		 * Ready and Valid Flags Logic
+		 */
+		 
+	reg ready_reg = 1'b0;
+	reg valid_reg = 1'b0;
+	
+	assign ready = ready_reg;
+	assign valid = valid_reg;
+
+		// ready flag logic
+   always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0)							ready_reg <= 1'b0;	// reset flag to default state
+		else case (fsm_state)
+			FSM_STATE_IDLE:	if (init_trig)		ready_reg <= 1'b0;	// clear flag when operation is started
+			FSM_STATE_STOP:	if (!ready_reg)	ready_reg <= 1'b1;	// set flag after operation is finished
+		endcase
+
+		// valid flag logic
+   always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0)							valid_reg <= 1'b0;	// reset flag to default state
+		else case (fsm_state)
+			FSM_STATE_IDLE:	if (next_trig)		valid_reg <= 1'b0;	// clear flag when operation is started
+			FSM_STATE_STOP:	if (!valid_reg)	valid_reg <= 1'b1;	// set flag after operation is finished
+		endcase
+
+	
+		/*
+		 * Parameters Latch
+		 */
+	reg	[OPERAND_ADDR_WIDTH-1:0]	modulus_num_words_latch;
+	reg	[OPERAND_ADDR_WIDTH+4:0]	exponent_num_bits_latch;
+
+		// save number of words in modulus when pre-calculation has been triggered,
+		// i.e. user has apparently loaded a new modulus into the core
+	always @(posedge clk)
+		//
+		if (fsm_next_state == FSM_STATE_PRECALC_START)
+			modulus_num_words_latch <= modulus_num_words;
+
+		// save number of bits in exponent when exponentiation has been triggered,
+		// i.e. user has loaded a new message into the core and wants exponentiate
+	always @(posedge clk)
+		//
+		if (fsm_next_state == FSM_STATE_EXPONENT_START)
+			exponent_num_bits_latch <= exponent_num_bits;
+
+	
+		/*
+		 * Split bus address into bank/word parts.
+		 */
+	wire	[                 2 - 1 : 0]	bus_addr_bank = bus_addr[OPERAND_ADDR_WIDTH+1:OPERAND_ADDR_WIDTH];
+	wire	[OPERAND_ADDR_WIDTH - 1 : 0]	bus_addr_word = bus_addr[OPERAND_ADDR_WIDTH-1:0];
+	
+	
+		/*
+		 * Define bank offsets.
+		 */
+	localparam	[ 1: 0]	BANK_MODULUS	= 2'b00;	// 0
+	localparam	[ 1: 0]	BANK_MESSAGE	= 2'b01;	// 1
+	localparam	[ 1: 0]	BANK_EXPONENT	= 2'b10;	// 2
+	localparam	[ 1: 0]	BANK_RESULT		= 2'b11;	// 3
+	
+	
+		/*
+		 * Instantiate user-accessible memories.
+		 *
+		 * We have four block memories: N for modulus, M for message, D for exponent
+		 * and R for result. Memories N, M and D and writeable from the user's side,
+		 * memory R is writeable from the core's side and is read-only by user.
+		 *
+		 * Note, that the core does squaring and multiplication simultaneously, so
+		 * there are two identical systolic multipliers inside. It's better to have two
+		 * copies of modulus to give router some freeding in placing the multipliers,
+		 * that's why there are actually two identical block memories N1 and N2 instead of N.
+		 * User reads from the first one, but writes to both of them. Note that the synthesis
+		 * tool might get too clever and find out that N1 and N2 are identical and decide
+		 * to throw one of them away, use (* KEEP="TRUE" *) or something like that then.
+		 *
+		 * We also need N3 and N4, because during pre-computation F and N_COEFF are calculated
+		 * at the same time, so we need two more copies of modulus to allow different words
+		 * of it to be read at the same time.
+		 */
+	
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_n1_addr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_n2_addr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_n3_addr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_n4_addr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_m_addr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_d_addr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_r_addr;
+	
+	wire	[                32-1:0]	core_n1_data;
+	wire	[                32-1:0]	core_n2_data;
+	wire	[                32-1:0]	core_n3_data;
+	wire	[                32-1:0]	core_n4_data;
+	wire	[                32-1:0]	core_m_data;
+	wire	[                32-1:0]	core_d_data;
+	wire	[                32-1:0]	core_r_data;
+	
+	wire	[                32-1:0]	user_n_data;
+	wire	[                32-1:0]	user_m_data;
+	wire	[                32-1:0]	user_d_data;
+	wire	[                32-1:0]	user_r_data;
+	
+	wire										core_r_wren;
+	wire										user_n_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MODULUS);
+	wire										user_m_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MESSAGE);
+	wire										user_d_wren = bus_cs && bus_we && (bus_addr_bank == BANK_EXPONENT);
+	
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_n1 (.clk(clk),
+		.a_addr(bus_addr_word), .a_out(user_n_data), .a_wr(user_n_wren), .a_in(bus_data_wr), 
+		.b_addr(core_n1_addr),  .b_out(core_n1_data));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_n2 (.clk(clk),
+		.a_addr(bus_addr_word), .a_out(), .a_wr(user_n_wren), .a_in(bus_data_wr), 
+		.b_addr(core_n2_addr),  .b_out(core_n2_data));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_n3 (.clk(clk),
+		.a_addr(bus_addr_word), .a_out(), .a_wr(user_n_wren), .a_in(bus_data_wr), 
+		.b_addr(core_n3_addr),  .b_out(core_n3_data));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_n4 (.clk(clk),
+		.a_addr(bus_addr_word), .a_out(), .a_wr(user_n_wren), .a_in(bus_data_wr), 
+		.b_addr(core_n4_addr),  .b_out(core_n4_data));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_m (.clk(clk),
+		.a_addr(bus_addr_word), .a_out(user_m_data), .a_wr(user_m_wren), .a_in(bus_data_wr), 
+		.b_addr(core_m_addr),   .b_out(core_m_data));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_d (.clk(clk),
+		.a_addr(bus_addr_word), .a_out(user_d_data), .a_wr(user_d_wren), .a_in(bus_data_wr), 
+		.b_addr(core_d_addr), .b_out(core_d_data));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_r (.clk(clk),
+		.a_addr(core_r_addr), .a_out(), .a_wr(core_r_wren), .a_in(core_r_data), 
+		.b_addr(bus_addr_word), .b_out(user_r_data));
+
+		
+		/*
+		 * Instantiate internal memories.
+		 *
+		 * We have two block memories: F for Montgomery factor and N_COEFF for modulus-dependent
+		 * coefficient, they are written to during pre-calculation and read from during exponentiation.
+		 *
+		 * Note, that there are actually two identical block memories N_COEFF1 and N_COEFF2 instead of
+		 * just one N_COEFF, read the explanation above. F is only used by one of the multipliers, so
+		 * we don't need F1 and F2.
+		 */
+
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_f_addr_wr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_f_addr_rd;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_n_coeff_addr_wr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_n_coeff1_addr_rd;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	core_n_coeff2_addr_rd;
+	
+	wire	[                32-1:0]	core_f_data_wr;
+	wire	[                32-1:0]	core_f_data_rd;
+	wire	[                32-1:0]	core_n_coeff_data_wr;
+	wire	[                32-1:0]	core_n_coeff1_data_rd;
+	wire	[                32-1:0]	core_n_coeff2_data_rd;
+		
+	wire										core_f_wren;
+	wire										core_n_coeff_wren;
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_f (.clk(clk),
+		.a_addr(core_f_addr_wr), .a_out(), .a_wr(core_f_wren), .a_in(core_f_data_wr),
+		.b_addr(core_f_addr_rd), .b_out(core_f_data_rd));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_n_coeff1 (.clk(clk),
+		.a_addr(core_n_coeff_addr_wr), .a_out(), .a_wr(core_n_coeff_wren), .a_in(core_n_coeff_data_wr),
+		.b_addr(core_n_coeff1_addr_rd), .b_out(core_n_coeff1_data_rd));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_n_coeff2 (.clk(clk),
+		.a_addr(core_n_coeff_addr_wr), .a_out(), .a_wr(core_n_coeff_wren), .a_in(core_n_coeff_data_wr),
+		.b_addr(core_n_coeff2_addr_rd), .b_out(core_n_coeff2_data_rd));
+		
+
+		/*
+		 * Montgomery factor calculation module.
+		 */
+	(* EQUIVALENT_REGISTER_REMOVAL="NO" *)
+	reg	precalc_f_ena = 1'b0;
+	wire	precalc_r_rdy;
+	
+	modexpa7_factor #
+	(
+		.OPERAND_ADDR_WIDTH	(OPERAND_ADDR_WIDTH)
+	)
+	precalc_f
+	(
+		.clk				(clk),
+		.rst_n			(rst_n),
+
+		.ena				(precalc_f_ena),
+		.rdy				(precalc_r_rdy),
+
+		.n_bram_addr	(core_n3_addr),
+		.f_bram_addr	(core_f_addr_wr),
+
+		.n_bram_out		(core_n3_data),
+
+		.f_bram_in		(core_f_data_wr),
+		.f_bram_wr		(core_f_wren),
+
+		.n_num_words	(modulus_num_words_latch)
+	);
+	
+	
+		/*
+		 * Modulus-depentent coefficient calculation module.
+		 */
+	(* EQUIVALENT_REGISTER_REMOVAL="NO" *)	 
+	reg	precalc_n_coeff_ena = 1'b0;
+	wire	precalc_n_coeff_rdy;
+	
+	modexpa7_n_coeff #
+	(
+		.OPERAND_ADDR_WIDTH	(OPERAND_ADDR_WIDTH)
+	)
+	precalc_n_coeff
+	(
+		.clk						(clk),
+		.rst_n					(rst_n),
+
+		.ena						(precalc_n_coeff_ena),
+		.rdy						(precalc_n_coeff_rdy),
+
+		.n_bram_addr			(core_n4_addr),
+		.n_coeff_bram_addr	(core_n_coeff_addr_wr),
+
+		.n_bram_out				(core_n4_data),
+
+		.n_coeff_bram_in		(core_n_coeff_data_wr),
+		.n_coeff_bram_wr		(core_n_coeff_wren),
+
+		.n_num_words			(modulus_num_words_latch)
+	);		 
+
+		/*
+		 * Exponentiation module.
+		 */
+		 
+	reg	exponent_ena = 1'b0;
+	wire	exponent_rdy;
+		 
+	modexpa7_exponentiator #
+	(
+		.OPERAND_ADDR_WIDTH		(OPERAND_ADDR_WIDTH),
+		.SYSTOLIC_ARRAY_POWER	(SYSTOLIC_ARRAY_POWER)
+	)
+	exponent_r
+	(
+		.clk						(clk),
+		.rst_n					(rst_n),
+
+		.ena						(exponent_ena),
+		.rdy						(exponent_rdy),
+		
+		.m_bram_addr			(core_m_addr),
+		.d_bram_addr			(core_d_addr),
+		.f_bram_addr			(core_f_addr_rd),
+		.n1_bram_addr			(core_n1_addr),
+		.n2_bram_addr			(core_n2_addr),
+		.n_coeff1_bram_addr	(core_n_coeff1_addr_rd),
+		.n_coeff2_bram_addr	(core_n_coeff2_addr_rd),
+		.r_bram_addr			(core_r_addr),
+
+		.m_bram_out				(core_m_data),
+		.d_bram_out				(core_d_data),
+		.f_bram_out				(core_f_data_rd),
+		.n1_bram_out			(core_n1_data),
+		.n2_bram_out			(core_n2_data),
+		.n_coeff1_bram_out	(core_n_coeff1_data_rd),
+		.n_coeff2_bram_out	(core_n_coeff2_data_rd),
+
+		.r_bram_in				(core_r_data),
+		.r_bram_wr				(core_r_wren),
+
+		.m_num_words			(modulus_num_words_latch),
+		.d_num_bits				(exponent_num_bits_latch)
+	);
+	
+	
+		/*
+		 * Sub-Module Enable Logic
+		 */
+		 
+	always @(posedge clk) begin
+		precalc_f_ena			<= (fsm_next_state == FSM_STATE_PRECALC_START)  ? 1'b1 : 1'b0;
+		precalc_n_coeff_ena	<= (fsm_next_state == FSM_STATE_PRECALC_START)  ? 1'b1 : 1'b0;
+		exponent_ena			<= (fsm_next_state == FSM_STATE_EXPONENT_START) ? 1'b1 : 1'b0;
+	end
+		
+		
+	
+	
+		/*
+		 * FSM Process
+		 */
+		 
+	always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0)	fsm_state <= FSM_STATE_IDLE;
+		else						fsm_state <= fsm_next_state;
+	
+	
+		/*
+		 * FSM Transition Logic
+		 */
+		 
+		 // handy flag that tells whether both pre-calculations modules are idle
+	wire	precalc_rdy = precalc_n_coeff_rdy && precalc_r_rdy;
+	
+	always @* begin
+		//
+		fsm_next_state = FSM_STATE_STOP;
+		//
+		case (fsm_state)
+			//
+			FSM_STATE_IDLE:					if (init_trig)			fsm_next_state = FSM_STATE_PRECALC_START;		// init has priority over next
+													else if (next_trig)	fsm_next_state = FSM_STATE_EXPONENT_START;
+													else						fsm_next_state = FSM_STATE_IDLE;
+			//
+			FSM_STATE_PRECALC_START:									fsm_next_state = FSM_STATE_PRECALC_CRUNCH;
+			FSM_STATE_PRECALC_CRUNCH:		if (precalc_rdy)		fsm_next_state = FSM_STATE_PRECALC_FINAL;
+													else						fsm_next_state = FSM_STATE_PRECALC_CRUNCH;
+			FSM_STATE_PRECALC_FINAL:									fsm_next_state = FSM_STATE_STOP;
+			//
+			FSM_STATE_EXPONENT_START:									fsm_next_state = FSM_STATE_EXPONENT_CRUNCH;
+			FSM_STATE_EXPONENT_CRUNCH:		if (exponent_rdy)		fsm_next_state = FSM_STATE_EXPONENT_FINAL;
+													else						fsm_next_state = FSM_STATE_EXPONENT_CRUNCH;
+			FSM_STATE_EXPONENT_FINAL:									fsm_next_state = FSM_STATE_STOP;
+			//
+			FSM_STATE_STOP:												fsm_next_state = FSM_STATE_IDLE;
+			//
+		endcase
+		//
+	end
+
+
+		/*
+		 * Bus read mux.
+		 */
+		 
+		// delay bus_addr_bank by 1 clock cycle to remember from where we've just been reading
+	reg	[1: 0]	bus_addr_bank_dly;
+	always @(posedge clk)
+		if (bus_cs) bus_addr_bank_dly <= bus_addr_bank;
+
+		// map mux to output port
+	reg	[31: 0]	bus_data_rd_mux;
+	assign bus_data_rd = bus_data_rd_mux;
+
+		// select the right data word
+	always @(*)
+		//
+		case (bus_addr_bank_dly)
+			//
+			BANK_MODULUS:		bus_data_rd_mux = user_n_data;
+			BANK_MESSAGE:		bus_data_rd_mux = user_m_data;
+			BANK_EXPONENT:		bus_data_rd_mux = user_d_data;
+			BANK_RESULT:		bus_data_rd_mux = user_r_data;
+			//
+		endcase
+		
+
+endmodule
diff --git a/src/rtl/pe/modexpa7_primitive_switch.v b/src/rtl/pe/modexpa7_primitive_switch.v
index d38069b..3551d7a 100644
--- a/src/rtl/pe/modexpa7_primitive_switch.v
+++ b/src/rtl/pe/modexpa7_primitive_switch.v
@@ -1,4 +1,4 @@
-//`define USE_VENDOR_PRIMITIVES
+`define USE_VENDOR_PRIMITIVES
 
 `ifdef USE_VENDOR_PRIMITIVES
 



More information about the Commits mailing list