[Cryptech-Commits] [core/platform/common] 02/04: Reworked core selector generation script.

git at cryptech.is git at cryptech.is
Tue Feb 11 13:09:43 UTC 2020


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository core/platform/common.

commit 5807b0bfd7efe8dd3f83d679730241847517980b
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Thu Jan 23 13:08:59 2020 +0300

    Reworked core selector generation script.
    
    The core selector is now multi-cycle (see the message of /core/platform/alpha
    commit 35359243a63cac4a9e8cce6bd718f17756ce8a98 for more details). In short,
    for write operations, every core now has its own copy of the chip select,
    address and write data registers. For read operations, we should never need
    the combinational readback multiplexer again: it just won't meet timing with
    so many complex cores. Cores with combinational outputs, primarily those that
    don't have block memory inside, always have additional output registers.
    Moreover, the readback multiplexer is now registered too; this is required to
    get the multicycle constraint to work properly (again, refer to the
    aforementioned commit message).
---
 config/core_config.py | 341 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 222 insertions(+), 119 deletions(-)

diff --git a/config/core_config.py b/config/core_config.py
index 4033279..61e77d5 100755
--- a/config/core_config.py
+++ b/config/core_config.py
@@ -5,7 +5,7 @@ Generate core_selector.v and core_vfiles.mk for a set of cores.
 """
 
 #=======================================================================
-# Copyright (c) 2015-2017, NORDUnet A/S All rights reserved.
+# Copyright (c) 2015-2017, 2019 NORDUnet A/S All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are
@@ -34,21 +34,6 @@ Generate core_selector.v and core_vfiles.mk for a set of cores.
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #=======================================================================
 
-# The modexpa7 core drags in a one clock cycle delay to other cores,
-# to compensate for the extra clock cycle consumed by the block
-# memories used in the modexpa7 core.  We probably want a general
-# solution for this, because we're going to run into this problem for
-# any core that handles arguments big enough to require block memory.
-
-# To Do:
-#
-# - Consider automating the one-clock-cycle delay stuff by adding
-#   another boolean flag to the config file.  Default would be no
-#   delay, if any included core sets the "I use block memories" flag,
-#   all other cores would get the delay.  Slightly tedious but
-#   something we can calculate easily enough, and probably an
-#   improvement over wiring in the delay when nothing needs it.
-
 def main():
     """
     Parse arguments and config file, generate core list, generate output.
@@ -82,7 +67,7 @@ def main():
         Core.modexp = cfg.get(board_section, "modexp")
         if Core.extra_wires:
             # restore formatting
-            Core.extra_wires = Core.extra_wires.replace("\n", "\n   ") + "\n"
+            Core.extra_wires = Core.extra_wires.replace("\n", "\n    ") + "\n"
 
         if args.core:
             cores = args.core
@@ -98,7 +83,6 @@ def main():
             except ValueError:
                 if core not in cores:
                     cores.append(core)
-                
 
         cores.insert(0, "board_regs")
         cores.insert(1, "comm_regs")
@@ -111,21 +95,41 @@ def main():
         core_number = 0
         for core in cores:
             core_number = core.assign_core_number(core_number)
-
+            
+        for i, core in enumerate(cores):
+            core.assign_seq_number(i)
+
+        # On the unused piece of code below: we really should not try to
+        # optimize out the delay. This may have worked earlier, when we only
+        # had a small set of simple cores. There are a lot of complex cores
+        # by now, so the readback multiplexer gets pretty wide and will never
+        # meet timing if we make it purely combinatorial. Moreover, it turns
+        # out that additional delays are necessary to make it work at higher
+        # clock speeds.
         if False:
 
             # For some reason, attempting to optimize out the delay
             # code entirely results in a non-working bitstream.  Don't
             # know why, disabling the optimization works, so just do
             # that for now.
-
+            
             Core.need_one_cycle_delay = any(core.block_memory for core in cores)
 
+        # longest core/subcore instance name
+        max_name_len = 0
+        for core in cores:
+            if len(core.instance_name) > max_name_len:
+                max_name_len = len(core.instance_name)
+            for subcore in core.subcores:
+                if len(subcore.instance_name) > max_name_len:
+                    max_name_len = len(subcore.instance_name)
+
         args.verilog.write(createModule_template.format(
+            core_count = len(cores),
             core = cores[0],
-            addrs = "".join(core.createAddr()     for core in cores),
-            insts = "".join(core.createInstance() for core in cores),
-            muxes = "".join(core.createMux()      for core in cores) ))
+            addrs = "".join(core.createAddr(max_name_len) for core in cores),
+            insts = "".join(core.createInstance()         for core in cores),
+            muxes = "".join(core.createMux()              for core in cores) ))
 
         args.makefile.write(listVfiles_template.format(
             vfiles = "".join(core.listVfiles()    for core in cores)))
@@ -193,6 +197,7 @@ class Core(object):
         self.name = name
         self.cfg_section = "core " + name
         self.core_number = None
+        self.seq_number = None
         self.vfiles = []
         self.error_wire = True
         self.block_memory = False
@@ -211,6 +216,9 @@ class Core(object):
             subcore.assign_core_number(n + i + 1)
         return n + self.blocks
 
+    def assign_seq_number(self, n):
+        self.seq_number = n
+
     def configure(self, cfg):
         if self.instance_number == 0:
             self.vfiles.extend(cfg.getvalues(self.cfg_section, "vfiles"))
@@ -221,7 +229,7 @@ class Core(object):
         self.block_memory = cfg.getboolean(self.cfg_section, "block memory", self.block_memory)
         self.extra_ports = cfg.get(self.cfg_section, "extra ports")
         if self.extra_ports:
-            self.extra_ports = self.extra_ports.replace("\n", "\n      ") + "\n"
+            self.extra_ports = self.extra_ports.replace("\n", "\n        ") + "\n"
         self.blocks = int(cfg.get(self.cfg_section, "core blocks") or 1)
         self.block_max = self.blocks - 1
         if self.blocks > 1:
@@ -257,27 +265,43 @@ class Core(object):
 
     @property
     def error_wire_decl(self):
-        return "\n   wire                 error_{core.instance_name};".format(core = self) if self.error_wire else ""
+        return "\n    wire         error_{core.instance_name};".format(core = self) if self.error_wire else ""
 
     @property
     def error_port(self):
-        return ",\n      .error(error_{core.instance_name})".format(core = self) if self.error_wire else ""
+        return ",\n        .error(error_{core.instance_name})".format(core = self) if self.error_wire else ""
 
     @property
     def one_cycle_delay(self):
         return one_cycle_delay_template.format(core = self) if self.need_one_cycle_delay and not self.block_memory else ""
 
+    @property
+    def extra_pipeline_stage(self):
+        return extra_pipeline_stage_template.format(core = self)
+
     @property
     def mux_core_addr(self):
         if self.blocks == 1 or self.subcores:
             return "CORE_ADDR_{core.upper_instance_name}".format(core=self)
         else:
-            return ",\n       ".join("CORE_ADDR_{core.upper_instance_name} + {0}".format(i, core=self) for i in range(self.blocks)) 
+            return ",\n                ".join("CORE_ADDR_{core.upper_instance_name} + {core.addr_width}'h{0:04X}".format(i, core=self) for i in range(self.blocks)) 
 
     @property
-    def mux_data_reg(self):
-        return "read_data_" + self.instance_name + ("_reg" if self.need_one_cycle_delay and not self.block_memory else "")
+    def reg_data_out(self):
+        return "reg_read_data_" + self.instance_name
+        
+    @property
+    def comb_data_out(self):
+        return "comb_read_data_" + self.instance_name
+
+    @property
+    def wire_data_out(self):
+        return self.comb_data_out if self.need_one_cycle_delay and not self.block_memory else self.reg_data_out
 
+    @property
+    def pipe_data_out(self):
+        return "pipe_read_data_" + self.instance_name
+        
     @property
     def mux_error_reg(self):
         return "error_" + self.instance_name if self.error_wire else "0"
@@ -293,10 +317,10 @@ class Core(object):
         template = createInstance_template_dummy if self.dummy else createInstance_template_generic if self.blocks == 1 else createInstance_template_multi_block
         return template.format(core = self)
 
-    def createAddr(self):
+    def createAddr(self, max_name_len):
         if self.dummy:
             return ""
-        return createAddr_template.format(core = self) + "".join(subcore.createAddr() for subcore in self.subcores)
+        return createAddr_template.format(core = self, name_pad = max_name_len) + "".join(subcore.createAddr(max_name_len) for subcore in self.subcores)
 
     def createMux(self):
         if self.dummy:
@@ -328,32 +352,44 @@ class SubCore(Core):
 # Template used by .createAddr() methods.
 
 createAddr_template = """\
-   localparam   CORE_ADDR_{core.upper_instance_name:21s} = {core.addr_width}'h{core.core_number:02x};
+    localparam CORE_ADDR_{core.upper_instance_name:{name_pad}s} = {core.addr_width}'h{core.core_number:02x};
 """
 
 # Template used by Core.createInstance().
 
 createInstance_template_generic = """\
-   //----------------------------------------------------------------
-   // {core.upper_instance_name}
-   //----------------------------------------------------------------
-   wire                 enable_{core.instance_name} = (addr_core_num == CORE_ADDR_{core.upper_instance_name});
-   wire [31: 0]         read_data_{core.instance_name};{core.error_wire_decl}
-
-   {core.module_name} {core.parameters}{core.instance_name}_inst
-     (
-      .clk(sys_clk),
-      .{core.reset_name}(sys_rst_n),
+    //----------------------------------------------------------------
+    // {core.upper_instance_name}
+    //----------------------------------------------------------------
+    wire         enable_{core.instance_name} = (addr_core_num == CORE_ADDR_{core.upper_instance_name});
+    wire [31: 0] {core.wire_data_out};{core.error_wire_decl}
+
+                                                                    reg          select_{core.instance_name} = 1'b0;
+    (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg          write_{core.instance_name} = 1'b0;
+    (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg  [31: 0] write_data_{core.instance_name};
+    (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg  [ 7: 0] addr_{core.instance_name};
+    
+    always @(posedge sys_clk) begin
+        select_{core.instance_name} <= enable_{core.instance_name} && sys_{core.bus_name}_cs;
+        write_{core.instance_name} <= sys_{core.bus_name}_wr;
+        write_data_{core.instance_name} <= sys_write_data;
+        addr_{core.instance_name} <= addr_core_reg;
+    end
+
+    {core.module_name} {core.parameters}{core.instance_name}_inst
+    (
+        .clk(sys_clk),
+        .{core.reset_name}(sys_rst_n_fanout[{core.seq_number}]),
 {core.extra_ports}
-      .cs(enable_{core.instance_name} & (sys_{core.bus_name}_rd | sys_{core.bus_name}_wr)),
-      .we(sys_{core.bus_name}_wr),
-
-      .address(addr_core_reg),
-      .write_data(sys_write_data),
-      .read_data(read_data_{core.instance_name}){core.error_port}
-      );
+        .cs(select_{core.instance_name}),
+        .we(write_{core.instance_name}),
+        .address(addr_{core.instance_name}),
+        .write_data(write_data_{core.instance_name}),
+        .read_data({core.wire_data_out}){core.error_port}
+    );
 
 {core.one_cycle_delay}
+{core.extra_pipeline_stage}
 
 """
 
@@ -361,27 +397,39 @@ createInstance_template_generic = """\
 # enough from the base template that it's easier to make this separate.
 
 createInstance_template_multi_block = """\
-   //----------------------------------------------------------------
-   // {core.upper_instance_name}
-   //----------------------------------------------------------------
-   wire                 enable_{core.instance_name} = (addr_core_num >= CORE_ADDR_{core.upper_instance_name}) && (addr_core_num <= CORE_ADDR_{core.upper_instance_name} + {core.addr_width}'h{core.block_max:02x});
-   wire [31: 0]         read_data_{core.instance_name};{core.error_wire_decl}
-   wire [{core.block_bit_max}:0]           {core.instance_name}_prefix = addr_core_num[{core.block_bit_max}:0] - CORE_ADDR_{core.upper_instance_name};
-
-   {core.module_name} {core.parameters}{core.instance_name}_inst
-     (
-      .clk(sys_clk),
-      .{core.reset_name}(sys_rst_n),
+    //----------------------------------------------------------------
+    // {core.upper_instance_name}
+    //----------------------------------------------------------------
+    wire         enable_{core.instance_name} = (addr_core_num >= CORE_ADDR_{core.upper_instance_name}) && (addr_core_num <= (CORE_ADDR_{core.upper_instance_name} + {core.addr_width}'h{core.block_max:02x}));
+    wire [31: 0] {core.wire_data_out};{core.error_wire_decl}
+    wire [{core.block_bit_max:>2}: 0] prefix_{core.instance_name} = addr_core_num[{core.block_bit_max}:0] - CORE_ADDR_{core.upper_instance_name}[{core.block_bit_max}:0];
+
+                                                                    reg           select_{core.instance_name} = 1'b0;
+    (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg           write_{core.instance_name} = 1'b0;
+    (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg  [ 31: 0] write_data_{core.instance_name};
+    (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg  [{core.block_bits}+7: 0] addr_{core.instance_name};
+    
+    always @(posedge sys_clk) begin
+        select_{core.instance_name} <= enable_{core.instance_name} && sys_{core.bus_name}_cs;
+        write_{core.instance_name} <= sys_{core.bus_name}_wr;
+        write_data_{core.instance_name} <= sys_write_data;
+        addr_{core.instance_name} <= {{prefix_{core.instance_name}, addr_core_reg}};
+    end
+
+    {core.module_name} {core.parameters}{core.instance_name}_inst
+    (
+        .clk(sys_clk),
+        .{core.reset_name}(sys_rst_n_fanout[{core.seq_number}]),
 {core.extra_ports}
-      .cs(enable_{core.instance_name} & (sys_{core.bus_name}_rd | sys_{core.bus_name}_wr)),
-      .we(sys_{core.bus_name}_wr),
-
-      .address({{{core.instance_name}_prefix, addr_core_reg}}),
-      .write_data(sys_write_data),
-      .read_data(read_data_{core.instance_name}){core.error_port}
-      );
+        .cs(select_{core.instance_name}),
+        .we(write_{core.instance_name}),
+        .address(addr_{core.instance_name}),
+        .write_data(write_data_{core.instance_name}),
+        .read_data({core.wire_data_out}){core.error_port}
+    );
 
 {core.one_cycle_delay}
+{core.extra_pipeline_stage}
 
 """
 
@@ -395,19 +443,28 @@ createInstance_template_dummy = """\
 # Template for one-cycle delay code.
 
 one_cycle_delay_template = """\
-   reg  [31: 0] read_data_{core.instance_name}_reg;
-   always @(posedge sys_clk)
-     read_data_{core.instance_name}_reg <= read_data_{core.instance_name};
+    (* SHREG_EXTRACT="NO" *)
+    reg [31: 0] {core.reg_data_out};
+    always @(posedge sys_clk)
+        {core.reg_data_out} <= {core.wire_data_out};
+"""
+
+# Template for an extra delay cycle code.
+
+extra_pipeline_stage_template = """\
+    (* SHREG_EXTRACT="NO" *)
+    reg [31: 0] {core.pipe_data_out};
+    always @(posedge sys_clk)
+        {core.pipe_data_out} <= {core.reg_data_out};
 """
 
 # Template for .createMux() methods.
 
 createMux_template = """\
-       {core.mux_core_addr}:
-         begin
-            sys_read_data_mux = {core0.mux_data_reg};
-            sys_error_mux = {core0.mux_error_reg};
-         end
+                {core.mux_core_addr}: begin
+                    sys_read_data_mux <= {core0.pipe_data_out};
+                    sys_error_mux     <= {core0.mux_error_reg};
+                end
 """
 
 # Top-level (createModule) template.
@@ -416,56 +473,102 @@ createModule_template = """\
 // NOTE: This file is generated; do not edit.
 
 module core_selector
-  (
-   input wire          sys_clk,
-   input wire          sys_rst_n,
-
-   input wire [{core.bus_max}: 0]  sys_{core.bus_name}_addr,
-   input wire          sys_{core.bus_name}_wr,
-   input wire          sys_{core.bus_name}_rd,
-   output wire [31: 0] sys_read_data,
-   input wire [31: 0]  sys_write_data,
-   output wire         sys_error,
-{core.extra_wires}
-   input wire          noise,
-   output wire [7 : 0] debug
-   );
-
-
-   //----------------------------------------------------------------
-   // Address Decoder
-   //----------------------------------------------------------------
-   // upper {core.addr_width} bits specify core being addressed
-   wire [{core.addr_max:>2}: 0]         addr_core_num   = sys_{core.bus_name}_addr[{core.bus_max}: 8];
-   // lower 8 bits specify register offset in core
-   wire [ 7: 0]         addr_core_reg   = sys_{core.bus_name}_addr[ 7: 0];
+(
+    input  wire         sys_clk,
+    input  wire         sys_rst_n,
+
+    input  wire [{core.bus_max}: 0] sys_{core.bus_name}_addr,
+    input  wire         sys_{core.bus_name}_wr,
+    input  wire         sys_{core.bus_name}_rd,
+    output wire [31: 0] sys_read_data,
+    input  wire [31: 0] sys_write_data,
+    output wire         sys_error,
+    {core.extra_wires}
+    input  wire         noise,
+    output wire [ 7 :0] debug
+);
+
+
+    //----------------------------------------------------------------
+    // Localized Resets Generator
+    //----------------------------------------------------------------
+    wire [{core_count}-1:0] sys_rst_n_fanout;
+    reset_replicator #
+    (
+        .SHREG_WIDTH(8),
+        .FANOUT_WIDTH({core_count})
+    )
+    reset_replicator_inst
+    (
+        .sys_clk_in    (sys_clk),
+        .sys_rst_n_in  (sys_rst_n),
+        .sys_rst_n_out (sys_rst_n_fanout)
+    );
+
+
+    //----------------------------------------------------------------
+    // Address Decoder
+    //----------------------------------------------------------------
+    // upper {core.addr_width} bits specify core being addressed
+    // lower 8 bits specify register offset in core
+    wire [{core.addr_max:>2}: 0] addr_core_num = sys_{core.bus_name}_addr[{core.bus_max}: 8];
+    wire [ 7: 0] addr_core_reg = sys_{core.bus_name}_addr[ 7: 0];
+
+
+    //----------------------------------------------------------------
+    // Core Address Table
+    //----------------------------------------------------------------
+{addrs}
 
 
-   //----------------------------------------------------------------
-   // Core Address Table
-   //----------------------------------------------------------------
-{addrs}
+    //----------------------------------------------------------------
+    // Core Instances
+    //----------------------------------------------------------------
+    wire sys_{core.bus_name}_cs = sys_{core.bus_name}_rd || sys_{core.bus_name}_wr;
 
 {insts}
-   //----------------------------------------------------------------
-   // Output (Read Data) Multiplexer
-   //----------------------------------------------------------------
-   reg [31: 0]          sys_read_data_mux;
-   assign               sys_read_data = sys_read_data_mux;
-   reg                  sys_error_mux;
-   assign               sys_error = sys_error_mux;
 
-   always @*
-
-     case (addr_core_num)
+    
+    //----------------------------------------------------------------
+    // Output (Read Data) Multiplexer
+    //----------------------------------------------------------------
+    (* SHREG_EXTRACT="NO" *) reg sys_{core.bus_name}_cs_dly1 = 1'b0;
+    (* SHREG_EXTRACT="NO" *) reg sys_{core.bus_name}_cs_dly2 = 1'b0;
+    (* SHREG_EXTRACT="NO" *) reg sys_{core.bus_name}_cs_dly3 = 1'b0;
+
+    (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.addr_max:>2}: 0] addr_core_num_dly1;
+    (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.addr_max:>2}: 0] addr_core_num_dly2;
+    (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.addr_max:>2}: 0] addr_core_num_dly3;
+
+    always @(posedge sys_clk) begin
+        sys_{core.bus_name}_cs_dly1 <= sys_{core.bus_name}_cs;
+        sys_{core.bus_name}_cs_dly2 <= sys_{core.bus_name}_cs_dly1;
+        sys_{core.bus_name}_cs_dly3 <= sys_{core.bus_name}_cs_dly2;
+    end
+    
+    always @(posedge sys_clk) begin
+        if (sys_{core.bus_name}_cs)      addr_core_num_dly1 <= addr_core_num;
+        if (sys_{core.bus_name}_cs_dly1) addr_core_num_dly2 <= addr_core_num_dly1;
+        if (sys_{core.bus_name}_cs_dly2) addr_core_num_dly3 <= addr_core_num_dly2;
+    end
+    
+    reg [31: 0] sys_read_data_mux;
+    reg         sys_error_mux;
+
+    assign sys_read_data = sys_read_data_mux;
+    assign sys_error     = sys_error_mux;
+
+    always @(posedge sys_clk)
+
+        if (sys_{core.bus_name}_cs_dly3)
+        
+            case (addr_core_num_dly3)
 {muxes}
-       default:
-         begin
-            sys_read_data_mux = {{32{{1'b0}}}};
-            sys_error_mux = 1;
-         end
-     endcase
-
+                default: begin
+                    sys_read_data_mux <= {{32{{1'b0}}}};
+                    sys_error_mux     <= 1'b1;
+                end
+            endcase
 
 endmodule
 



More information about the Commits mailing list