[Cryptech-Commits] [core/math/modexpng] 90/92: More elegant way to do partial product recombination: * take advantage of the cascade paths between DSP slices * decrease latency of operation
git at cryptech.is
git at cryptech.is
Sat Mar 14 18:20:09 UTC 2020
This is an automated email from the git hooks/post-receive script.
paul at psgd.org pushed a commit to branch master
in repository core/math/modexpng.
commit 2791a17430c5b0c3291be3824aa8cdf07f305e92
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Tue Feb 11 15:54:22 2020 +0300
More elegant way to do partial product recombination:
* take advantage of the cascade paths between DSP slices
* decrease latency of operation
---
rtl/modexpng_recombinator_block.v | 83 +++++------------
rtl/modexpng_recombinator_cell.v | 185 +++++++++++++++++++-------------------
2 files changed, 115 insertions(+), 153 deletions(-)
diff --git a/rtl/modexpng_recombinator_block.v b/rtl/modexpng_recombinator_block.v
index e3cb50f..62d84e1 100644
--- a/rtl/modexpng_recombinator_block.v
+++ b/rtl/modexpng_recombinator_block.v
@@ -496,10 +496,8 @@ module modexpng_recombinator_block
reg rcmb_xy_lsb_ce = 1'b0;
reg rcmb_xy_lsb_ce_aux = 1'b0;
- reg rcmb_xy_lsb_ce_aux_dly = 1'b0;
reg [ 2:0] rcmb_xy_lsb_ce_purge = 3'b000;
- wire rcmb_xy_lsb_ce_combined = rcmb_xy_lsb_ce | rcmb_xy_lsb_ce_aux | rcmb_xy_lsb_ce_purge[0];
- wire rcmb_xy_lsb_ce_combined_ext = rcmb_xy_lsb_ce | rcmb_xy_lsb_ce_aux | rcmb_xy_lsb_ce_purge[0] | rcmb_xy_lsb_ce_aux_dly;
+ wire rcmb_xy_lsb_ce_combined = rcmb_xy_lsb_ce | rcmb_xy_lsb_ce_aux | rcmb_xy_lsb_ce_purge[0];
reg rcmb_xy_lsb_clr;
wire rcmb_xy_lsb_cry = !xy_valid_latch_lsb && rcmb_xy_lsb_ce_purge[1];
@@ -512,9 +510,7 @@ module modexpng_recombinator_block
reg rcmb_xy_msb_ce = 1'b0;
reg [ 1:0] rcmb_xy_msb_ce_purge = 2'b00;
- reg rcmb_xy_msb_ce_purge0_rectangle_dly = 1'b0;
- wire rcmb_xy_msb_ce_combined = rcmb_xy_msb_ce | rcmb_xy_msb_ce_purge[0];
- wire rcmb_xy_msb_ce_combined_ext = rcmb_xy_msb_ce | rcmb_xy_msb_ce_purge[0] | rcmb_xy_msb_ce_purge0_rectangle_dly;
+ wire rcmb_xy_msb_ce_combined = rcmb_xy_msb_ce | rcmb_xy_msb_ce_purge[0];
reg rcmb_xy_msb_clr;
reg [ MAC_W -1:0] rcmb_x_msb_din;
@@ -522,46 +518,44 @@ module modexpng_recombinator_block
wire [WORD_W -1:0] rcmb_x_msb_dout;
wire [WORD_W -1:0] rcmb_y_msb_dout;
- always @(posedge clk) rcmb_xy_lsb_ce_aux_dly <= rcmb_xy_lsb_ce_aux;
- always @(posedge clk) rcmb_xy_msb_ce_purge0_rectangle_dly <= rcmb_mode == RCMB_MODE_RECTANGLE ? rcmb_xy_msb_ce_purge[0] : 1'b0;
-
- modexpng_recombinator_cell recomb_x_lsb_new
+ modexpng_recombinator_cell recomb_x_lsb
(
.clk (clk),
- .ce (rcmb_xy_lsb_ce_combined_ext),
+ .ce (rcmb_xy_lsb_ce_combined),
.clr (rcmb_xy_lsb_clr),
.din (rcmb_x_lsb_din),
.dout (rcmb_x_lsb_dout),
.doutw (rcmb_x_lsb_doutw)
);
- modexpng_recombinator_cell recomb_y_lsb_new
+ modexpng_recombinator_cell recomb_y_lsb
(
.clk (clk),
- .ce (rcmb_xy_lsb_ce_combined_ext),
+ .ce (rcmb_xy_lsb_ce_combined),
.clr (rcmb_xy_lsb_clr),
.din (rcmb_y_lsb_din),
.dout (rcmb_y_lsb_dout),
.doutw (rcmb_y_lsb_doutw)
);
- modexpng_recombinator_cell recomb_x_msb_new
+ modexpng_recombinator_cell recomb_x_msb
(
.clk (clk),
- .ce (rcmb_xy_msb_ce_combined_ext),
+ .ce (rcmb_xy_msb_ce_combined),
.clr (rcmb_xy_msb_clr),
.din (rcmb_x_msb_din),
.dout (rcmb_x_msb_dout),
.doutw ()
);
- modexpng_recombinator_cell recomb_y_msb_new
+ modexpng_recombinator_cell recomb_y_msb
(
.clk (clk),
- .ce (rcmb_xy_msb_ce_combined_ext),
+ .ce (rcmb_xy_msb_ce_combined),
.clr (rcmb_xy_msb_clr),
.din (rcmb_y_msb_din),
.dout (rcmb_y_msb_dout),
.doutw ()
);
+
always @(posedge clk) begin
//
@@ -596,8 +590,8 @@ module modexpng_recombinator_block
rcmb_x_lsb_din <= dsp_x_p_latch[NUM_MULTS_AUX-1];
rcmb_y_lsb_din <= dsp_y_p_latch[NUM_MULTS_AUX-1];
end else if (rcmb_xy_lsb_cry) begin
- rcmb_x_lsb_din <= rcmb_x_msb_carry_1;
- rcmb_y_lsb_din <= rcmb_y_msb_carry_1;
+ rcmb_x_lsb_din <= {{(MAC_W-WORD_W){1'b0}}, rcmb_x_msb_carry_1};
+ rcmb_y_lsb_din <= {{(MAC_W-WORD_W){1'b0}}, rcmb_y_msb_carry_1};
end else begin
rcmb_x_lsb_din <= {MAC_W{1'b0}};
rcmb_y_lsb_din <= {MAC_W{1'b0}};
@@ -759,52 +753,17 @@ module modexpng_recombinator_block
end
- reg rcmb_xy_lsb_ce_combined_dly1 = 1'b0;
- reg rcmb_xy_msb_ce_combined_dly1 = 1'b0;
-
- reg rcmb_xy_lsb_ce_combined_dly2 = 1'b0;
- reg rcmb_xy_msb_ce_combined_dly2 = 1'b0;
-
- reg rcmb_xy_lsb_ce_combined_dly3 = 1'b0;
- reg rcmb_xy_msb_ce_combined_dly3 = 1'b0;
-
- reg rcmb_xy_lsb_ce_combined_dly4 = 1'b0;
- reg rcmb_xy_msb_ce_combined_dly4 = 1'b0;
-
- reg rcmb_xy_lsb_ce_combined_dly5 = 1'b0;
- reg rcmb_xy_msb_ce_combined_dly5 = 1'b0;
-
- reg rcmb_xy_lsb_ce_combined_dly6 = 1'b0;
- reg rcmb_xy_msb_ce_combined_dly6 = 1'b0;
+ reg [4:1] rcmb_xy_lsb_ce_combined_dly;
+ reg [4:1] rcmb_xy_msb_ce_combined_dly;
always @(posedge clk or negedge rst_n)
//
if (!rst_n) begin
- rcmb_xy_lsb_ce_combined_dly1 <= 1'b0;
- rcmb_xy_msb_ce_combined_dly1 <= 1'b0;
- rcmb_xy_lsb_ce_combined_dly2 <= 1'b0;
- rcmb_xy_msb_ce_combined_dly2 <= 1'b0;
- rcmb_xy_lsb_ce_combined_dly3 <= 1'b0;
- rcmb_xy_msb_ce_combined_dly3 <= 1'b0;
- rcmb_xy_lsb_ce_combined_dly4 <= 1'b0;
- rcmb_xy_msb_ce_combined_dly4 <= 1'b0;
- rcmb_xy_lsb_ce_combined_dly5 <= 1'b0;
- rcmb_xy_msb_ce_combined_dly5 <= 1'b0;
- rcmb_xy_lsb_ce_combined_dly6 <= 1'b0;
- rcmb_xy_msb_ce_combined_dly6 <= 1'b0;
+ rcmb_xy_lsb_ce_combined_dly <= 4'b0000;
+ rcmb_xy_msb_ce_combined_dly <= 4'b0000;
end else begin
- rcmb_xy_lsb_ce_combined_dly1 <= rcmb_xy_lsb_ce_combined;
- rcmb_xy_msb_ce_combined_dly1 <= rcmb_xy_msb_ce_combined;
- rcmb_xy_lsb_ce_combined_dly2 <= rcmb_xy_lsb_ce_combined_dly1;
- rcmb_xy_msb_ce_combined_dly2 <= rcmb_xy_msb_ce_combined_dly1;
- rcmb_xy_lsb_ce_combined_dly3 <= rcmb_xy_lsb_ce_combined_dly2;
- rcmb_xy_msb_ce_combined_dly3 <= rcmb_xy_msb_ce_combined_dly2;
- rcmb_xy_lsb_ce_combined_dly4 <= rcmb_xy_lsb_ce_combined_dly3;
- rcmb_xy_msb_ce_combined_dly4 <= rcmb_xy_msb_ce_combined_dly3;
- rcmb_xy_lsb_ce_combined_dly5 <= rcmb_xy_lsb_ce_combined_dly4;
- rcmb_xy_msb_ce_combined_dly5 <= rcmb_xy_msb_ce_combined_dly4;
- rcmb_xy_lsb_ce_combined_dly6 <= rcmb_xy_lsb_ce_combined_dly5;
- rcmb_xy_msb_ce_combined_dly6 <= rcmb_xy_msb_ce_combined_dly5;
+ rcmb_xy_lsb_ce_combined_dly <= {rcmb_xy_lsb_ce_combined_dly[3:1], rcmb_xy_lsb_ce_combined};
+ rcmb_xy_msb_ce_combined_dly <= {rcmb_xy_msb_ce_combined_dly[3:1], rcmb_xy_msb_ce_combined};
end
reg rcmb_xy_lsb_valid = 1'b0;
@@ -816,8 +775,8 @@ module modexpng_recombinator_block
rcmb_xy_lsb_valid <= 1'b0;
rcmb_xy_msb_valid <= 1'b0;
end else begin
- rcmb_xy_lsb_valid <= rcmb_xy_lsb_ce_combined_dly6;
- rcmb_xy_msb_valid <= rcmb_xy_msb_ce_combined_dly6;
+ rcmb_xy_lsb_valid <= rcmb_xy_lsb_ce_combined_dly[4];
+ rcmb_xy_msb_valid <= rcmb_xy_msb_ce_combined_dly[4];
end
diff --git a/rtl/modexpng_recombinator_cell.v b/rtl/modexpng_recombinator_cell.v
index 0c9ab00..28d17f2 100644
--- a/rtl/modexpng_recombinator_cell.v
+++ b/rtl/modexpng_recombinator_cell.v
@@ -58,39 +58,51 @@ module modexpng_recombinator_cell
//
- // din <=> {z[13:0], y[15:0], x[15:0]}
+ // Pipelined Clock Enable, Clear, Data Input
//
- wire [WORD_W -1:0] din_z = {2'b00, din[3 * WORD_W -3 : 2 * WORD_W]}; // [47:46][45:32]
- wire [WORD_W -1:0] din_y = { din[2 * WORD_W -1 : WORD_W]}; // [31:16]
- wire [WORD_W -1:0] din_x = { din[ WORD_W -1 : 0]}; // [15: 0]
+ reg ce_pipe = 1'b0;
+ reg clr_pipe;
+ reg [MAC_W-1:0] din_pipe;
+
+ always @(posedge clk)
+ {ce_pipe, clr_pipe, din_pipe} <= {ce, clr, din};
//
- // Delayed Clock Enables
+ // din_pipe <=> {z[13:0], y[15:0], x[15:0]}
//
- reg ce_dly1 = 1'b0, ce_dly2 = 1'b0, ce_dly3 = 1'b0, ce_dly4 = 1'b0, ce_dly5 = 1'b0, ce_dly6 = 1'b0;
- always @(posedge clk) {ce_dly1, ce_dly2, ce_dly3, ce_dly4, ce_dly5, ce_dly6} <= {ce, ce_dly1, ce_dly2, ce_dly3, ce_dly4, ce_dly5};
+ wire [WORD_W -1:0] din_z = {2'b00, din_pipe[3 * WORD_W -3 : 2 * WORD_W]}; // (47:46)[45:32]
+ wire [WORD_W -1:0] din_y = { din_pipe[2 * WORD_W -1 : WORD_W]}; // [31:16]
+ wire [WORD_W -1:0] din_x = { din_pipe[ WORD_W -1 : 0]}; // [15: 0]
-
- //
- // Delayed Clear
- //
- reg clr_dly1, clr_dly2, clr_dly3, clr_dly4;
- always @(posedge clk) {clr_dly1, clr_dly2, clr_dly3, clr_dly4} <= {clr, clr_dly1, clr_dly2, clr_dly3};
-
//
// Phase Flip-Flop
//
- reg phase_ff, phase_ff_dly1, phase_ff_dly2, phase_ff_dly3, phase_ff_dly4, phase_ff_dly5;
+ reg phase_ff = 1'b0;
+
always @(posedge clk)
- if (ce) phase_ff <= ~phase_ff;
- else if (clr) phase_ff <= 1'b0;
+ phase_ff <= ce_pipe ? ~phase_ff : 1'b0;
- always @(posedge clk)
- {phase_ff_dly1, phase_ff_dly2, phase_ff_dly3, phase_ff_dly4, phase_ff_dly5} <= {phase_ff, phase_ff_dly1, phase_ff_dly2, phase_ff_dly3, phase_ff_dly4};
-
+
+ //
+ // Delayed Clock Enable, Clear, Data Input
+ //
+ wire master_ce_0;
+ reg master_ce_1 = 1'b0;
+ wire slave_ce_1;
+ reg slave_ce_2 = 1'b0;
+ reg dout_ce_3 = 1'b0;
+ reg dout_ce_4 = 1'b0;
+
+ assign master_ce_0 = ce_pipe;
+ assign slave_ce_1 = master_ce_1;
+ always @(posedge clk) master_ce_1 <= ce_pipe & ~phase_ff;
+ always @(posedge clk) slave_ce_2 <= slave_ce_1;
+ always @(posedge clk) {dout_ce_3, dout_ce_4} <= {slave_ce_2, dout_ce_3};
+
+
//
// Shift Registers
//
@@ -101,11 +113,11 @@ module modexpng_recombinator_cell
always @(posedge clk) begin
//
- if (ce) {din_x_dly1, din_y_dly1, din_z_dly1} <= {din_x, din_y, din_z};
- else if (clr) {din_x_dly1, din_y_dly1, din_z_dly1} <= {WORD_ZERO, WORD_ZERO, WORD_ZERO};
+ if (ce_pipe) {din_x_dly1, din_y_dly1, din_z_dly1} <= {din_x, din_y, din_z};
+ else if (clr_pipe) {din_x_dly1, din_y_dly1, din_z_dly1} <= {WORD_ZERO, WORD_ZERO, WORD_ZERO};
//
- if (ce) {din_z_dly2} <= {din_z_dly1};
- else if (clr) {din_z_dly2} <= {WORD_ZERO};
+ if (ce_pipe) {din_z_dly2} <= {din_z_dly1};
+ else if (clr_pipe) {din_z_dly2} <= {WORD_ZERO};
//
end
@@ -113,82 +125,65 @@ module modexpng_recombinator_cell
//
// DSP Input Registers
//
- reg [2 * WORD_W-1:0] master_ab_reg;
- reg [2 * WORD_W-1:0] master_c_reg;
-
- reg [ WORD_W+1:0] slave_ab_reg;
- reg [ WORD_W+1:0] slave_ab_next_reg;
-
+ wire [2 * WORD_W-1:0] master_ab;
+ wire [2 * WORD_W-1:0] master_c;
+ wire [2 * WORD_W-1:0] slave_ab;
+ reg slave_c;
+
+ assign master_ab = {din_y, din_y_dly1};
+ assign master_c = {din_z_dly1, din_z_dly2};
+ assign slave_ab = {din_x, din_x_dly1};
+
//
// DSP Cascade Bus
//
wire [DSP48E1_P_W-1:0] master_slave_p_int;
-
+
//
// DSP Output Buses
//
- wire [DSP48E1_P_W-1:0] master_p_int;
+ wire master_carry_out_int;
wire [DSP48E1_P_W-1:0] slave_p_int;
-
-
- //
- // DSP Input Mapping
- //
- wire [DSP48E1_C_W-1:0] master_ab_int = {{(DSP48E1_C_W - 2 * WORD_W){1'b0}}, master_ab_reg};
- wire [DSP48E1_C_W-1:0] master_c_int = {{(DSP48E1_C_W - 2 * WORD_W){1'b0}}, master_c_reg};
-
- wire [DSP48E1_C_W-1:0] slave_ab_int = {{(DSP48E1_C_W - (WORD_W+3)){1'b0}}, slave_ab_reg[WORD_W+1:WORD_W], 1'b1, slave_ab_reg[WORD_W-1:0]};
- wire [DSP48E1_C_W-1:0] slave_c_int = {DSP48E1_C_W{1'b0}};
+ wire slave_carry_out_int;
//
- // Master DSP Input Logic
+ // Custom Carry Cascade
//
always @(posedge clk)
//
- if (ce) begin
- master_ab_reg <= !phase_ff ? {din_y, din_y_dly1} : {din_x, din_x_dly1};
- master_c_reg <= !phase_ff ? {din_z_dly1, din_z_dly2} : {WORD_DNC, WORD_DNC};
- end else begin
- master_ab_reg <= {WORD_DNC, WORD_DNC};
- master_c_reg <= {WORD_DNC, WORD_DNC};
- end
-
+ if (slave_ce_2) slave_c <= master_carry_out_int;
+
//
- // Slave DSP Input Logic
+ // DSP Input Mapping
//
- always @(posedge clk) begin
- //
- slave_ab_reg <= {(WORD_W+2){1'bX}};
- slave_ab_next_reg <= {(WORD_W+2){1'bX}};
- //
- if (ce_dly3 && phase_ff_dly3) slave_ab_next_reg <= {master_p_int[2*WORD_W+1:WORD_W]};
- //
- if (ce_dly3 && phase_ff_dly3) slave_ab_reg <= {2'b00, master_p_int[WORD_W-1:0]};
- if (ce_dly4 && phase_ff_dly4) slave_ab_reg <= slave_ab_next_reg;
- //
- end
-
+ wire [DSP48E1_C_W-1:0] master_ab_int = {master_ab, {(DSP48E1_C_W - 2*WORD_W){1'b0}}};
+ wire [DSP48E1_C_W-1:0] master_c_int = {master_c, {(DSP48E1_C_W - 2*WORD_W){1'b0}}};
+
+ wire [DSP48E1_C_W-1:0] slave_ab_int = {slave_ab, {(DSP48E1_C_W - 2*WORD_W){1'b0}}};
+ wire [DSP48E1_C_W-1:0] slave_c_int = {{(2*WORD_W-1){1'b0}}, slave_c, {(DSP48E1_C_W-2*WORD_W){1'b1}}};
+
//
- // OPMODE Logic
+ // DSP Modes
//
- reg [DSP48E1_OPMODE_W-1:0] master_opmode;
- reg [DSP48E1_OPMODE_W-1:0] slave_opmode;
+ wire [DSP48E1_OPMODE_W -1:0] master_opmode;
+ wire [DSP48E1_CARRYINSEL_W-1:0] master_carryinsel;
+
+ reg [DSP48E1_OPMODE_W -1:0] slave_opmode;
+ reg [DSP48E1_CARRYINSEL_W-1:0] slave_carryinsel;
+
+ assign master_opmode = DSP48E1_OPMODE_Z0_YC_XAB;
+ assign master_carryinsel = DSP48E1_CARRYINSEL_CARRYIN;
always @(posedge clk) begin
- //
- if (ce) master_opmode <= !phase_ff ? DSP48E1_OPMODE_Z0_YC_XAB : DSP48E1_OPMODE_ZP_Y0_XAB;
- else master_opmode <= DSP48E1_OPMODE_DNC;
- //
- if (ce_dly4) slave_opmode <= clr_dly4 ? DSP48E1_OPMODE_Z0_Y0_XAB : DSP48E1_OPMODE_ZP17_Y0_XAB;
- else slave_opmode <= DSP48E1_OPMODE_DNC;
- //
+ slave_opmode <= clr_pipe ? DSP48E1_OPMODE_ZPCIN_Y0_XAB : DSP48E1_OPMODE_ZPCIN_YC_XAB;
+ slave_carryinsel <= clr_pipe ? DSP48E1_CARRYINSEL_CARRYIN : DSP48E1_CARRYINSEL_CARRYCASCOUT;
end
-
+
//
// DSP Slice Instances
@@ -196,42 +191,50 @@ module modexpng_recombinator_cell
`MODEXPNG_DSP_SLICE_ADDSUB dsp_master_inst
(
.clk (clk),
- .ce_abc (ce_dly1),
- .ce_p (ce_dly2),
- .ce_ctrl (ce_dly1),
+ .ce_abc (master_ce_0),
+ .ce_p (master_ce_1),
+ .ce_ctrl (master_ce_0),
.ab (master_ab_int),
.c (master_c_int),
- .p (master_p_int),
+ .p (),
.op_mode (master_opmode),
.alu_mode (DSP48E1_ALUMODE_Z_PLUS_X_AND_Y_AND_CIN),
- .carry_in_sel (DSP48E1_CARRYINSEL_CARRYIN),
+ .carry_in_sel (master_carryinsel),
.casc_p_in (),
- .casc_p_out (),
- .carry_out ()
+ .casc_p_out (master_slave_p_int),
+ .carry_out (master_carry_out_int)
);
`MODEXPNG_DSP_SLICE_ADDSUB dsp_slave_inst
(
.clk (clk),
- .ce_abc (ce_dly5),
- .ce_p (ce_dly6),
- .ce_ctrl (ce_dly5),
+ .ce_abc (slave_ce_1),
+ .ce_p (slave_ce_2),
+ .ce_ctrl (slave_ce_1),
.ab (slave_ab_int),
.c (slave_c_int),
.p (slave_p_int),
.op_mode (slave_opmode),
.alu_mode (DSP48E1_ALUMODE_Z_PLUS_X_AND_Y_AND_CIN),
- .carry_in_sel (DSP48E1_CARRYINSEL_CARRYIN),
- .casc_p_in (),
+ .carry_in_sel (slave_carryinsel),
+ .casc_p_in (master_slave_p_int),
.casc_p_out (),
- .carry_out ()
+ .carry_out (slave_carry_out_int)
);
//
// Output Register
- //
- assign dout = {slave_p_int[WORD_W-1:0]};
- assign doutw = {slave_p_int[WORD_W+1], dout};
+ //
+ reg [WORD_W:0] doutx_reg;
+
+ assign dout = doutx_reg[WORD_W-1:0];
+ assign doutw = doutx_reg;
+
+ always @(posedge clk) begin
+ doutx_reg <= {1'bX, WORD_DNC};
+ if (dout_ce_4) doutx_reg <= {slave_carry_out_int, slave_p_int[DSP48E1_P_W - 0*WORD_W -1 -: WORD_W]};
+ if (dout_ce_3) doutx_reg <= {1'b0, slave_p_int[DSP48E1_P_W - 1*WORD_W -1 -: WORD_W]};
+ end
endmodule
More information about the Commits mailing list