[Cryptech-Commits] [core/math/ecdsalib] 01/04: Initial commit of base point multiplier core for ECDSA curve P-256.

git at cryptech.is git at cryptech.is
Wed Mar 8 03:13:10 UTC 2017


This is an automated email from the git hooks/post-receive script.

sra at hactrn.net pushed a commit to branch master
in repository core/math/ecdsalib.

commit 25e338149fdb8e06c82d99600769a8498a85ef2c
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Mon Oct 31 00:14:51 2016 +0300

    Initial commit of base point multiplier core for ECDSA curve P-256.
---
 bench/tb_curve_adder_256.v                         | 420 +++++++++
 bench/tb_curve_doubler_256.v                       | 409 +++++++++
 bench/tb_curve_multiplier_256.v                    | 281 ++++++
 bench/tb_lowlevel_adder32.v                        | 175 ++++
 bench/tb_lowlevel_adder47.v                        | 151 ++++
 bench/tb_lowlevel_subtractor32.v                   | 174 ++++
 bench/tb_modular_adder.v                           | 357 ++++++++
 bench/tb_modular_invertor.v                        | 226 +++++
 bench/tb_modular_multiplier_256.v                  | 366 ++++++++
 bench/tb_modular_subtractor.v                      | 356 ++++++++
 bench/tb_mw_comparator.v                           | 322 +++++++
 bench/tb_mw_mover.v                                | 282 ++++++
 rtl/curve/curve_dbl_add_256.v                      | 868 ++++++++++++++++++
 rtl/curve/curve_mul_256.v                          | 720 +++++++++++++++
 rtl/curve/rom/brom_p256_delta.v                    |  68 ++
 rtl/curve/rom/brom_p256_g_x.v                      |  68 ++
 rtl/curve/rom/brom_p256_g_y.v                      |  68 ++
 rtl/curve/rom/brom_p256_h_x.v                      |  68 ++
 rtl/curve/rom/brom_p256_h_y.v                      |  68 ++
 rtl/curve/rom/brom_p256_one.v                      |  68 ++
 rtl/curve/rom/brom_p256_q.v                        |  68 ++
 rtl/curve/rom/brom_p256_zero.v                     |  70 ++
 rtl/curve/uop/uop_add_rom.v                        |  66 ++
 rtl/curve/uop/uop_conv_rom.v                       |  38 +
 rtl/curve/uop/uop_dbl_rom.v                        |  58 ++
 rtl/curve/uop/uop_init_rom.v                       |  33 +
 rtl/curve/uop_ecdsa.v                              |  50 ++
 rtl/ecdsa256.v                                     | 160 ++++
 rtl/ecdsa256_wrapper.v                             | 177 ++++
 rtl/lowlevel/adder32_wrapper.v                     |  73 ++
 rtl/lowlevel/adder47_wrapper.v                     |  69 ++
 rtl/lowlevel/artix7/adder32_artix7.v               |  96 ++
 rtl/lowlevel/artix7/adder47_artix7.v               |  91 ++
 rtl/lowlevel/artix7/dsp48e1_wrapper.v              | 159 ++++
 rtl/lowlevel/artix7/mac16_artix7.v                 |  90 ++
 rtl/lowlevel/artix7/subtractor32_artix7.v          |  94 ++
 rtl/lowlevel/ecdsa_lowlevel_settings.v             |  17 +
 rtl/lowlevel/mac16_wrapper.v                       |  75 ++
 rtl/lowlevel/subtractor32_wrapper.v                |  72 ++
 rtl/modular/modular_adder.v                        | 298 +++++++
 .../modular_invertor/helper/modinv_helper_copy.v   | 148 ++++
 .../modular_invertor/helper/modinv_helper_init.v   | 172 ++++
 .../helper/modinv_helper_invert_compare.v          | 286 ++++++
 .../helper/modinv_helper_invert_precalc.v          | 408 +++++++++
 .../helper/modinv_helper_invert_update.v           | 257 ++++++
 .../helper/modinv_helper_reduce_precalc.v          | 328 +++++++
 .../helper/modinv_helper_reduce_update.v           | 153 ++++
 rtl/modular/modular_invertor/modinv_clog2.v        |  10 +
 rtl/modular/modular_invertor/modular_invertor.v    | 981 +++++++++++++++++++++
 rtl/modular/modular_multiplier_256.v               | 402 +++++++++
 rtl/modular/modular_reductor_256.v                 | 666 ++++++++++++++
 rtl/modular/modular_subtractor.v                   | 292 ++++++
 rtl/multiword/mw_comparator.v                      | 220 +++++
 rtl/multiword/mw_mover.v                           | 175 ++++
 rtl/util/bram_1rw_1ro_readfirst.v                  | 101 +++
 55 files changed, 11968 insertions(+)

diff --git a/bench/tb_curve_adder_256.v b/bench/tb_curve_adder_256.v
new file mode 100644
index 0000000..a20743a
--- /dev/null
+++ b/bench/tb_curve_adder_256.v
@@ -0,0 +1,420 @@
+//------------------------------------------------------------------------------
+//
+// tb_curve_adder_256.v
+// -----------------------------------------------------------------------------
+// Testbench for 256-bit curve point adder.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+module tb_curve_adder_256;
+
+
+		//
+		// Test Vectors (PX/PY/PZ_n: input point, RX/RY/RZ_n: expected result, Q: P-256 prime loaded into the Q buffer)
+		//
+	localparam	[255:0]	PX_1	= 256'ha536512112e4bb911ae72744a914761ddc53700f889c88e583e0edd45c179b08;
+	localparam	[255:0]	PY_1	= 256'h239e73bf40f4831ab71ccea072291893ac8582982ea6fec6bd6aaf36ac32d22e;
+	localparam	[255:0]	PZ_1	= 256'h32258ae04c5498bb34b29c54a7f95afc10c009540c51731eae164750ca385029;
+
+	localparam	[255:0]	RX_1	= 256'he4fcdd1a151b405b2a567d20d7674031c6d5b207b0b5dcf277015d81784492d5;
+	localparam	[255:0]	RY_1	= 256'h4782c540b58988b07bb8e0c5ad3ff562dd45c075a39ee71896d5eb33702dd656;
+	localparam	[255:0]	RZ_1	= 256'hae637ff2fd5468780241afb3a8ebaeb8618e86b4a1a211b350546c9e6fea93d4;
+
+	localparam	[255:0]	PX_2	= 256'he58a6470e038f6b261d5a9a72fb2bd96b6bad433ff7baea6a40b5facf5085189;
+	localparam	[255:0]	PY_2	= 256'h03dd8785b592307811ee5512e2d713c5dc65f60f01883340fe0f56f858a39474;
+	localparam	[255:0]	PZ_2	= 256'h1b4657b1e79c9074fbf7f63f96ce2854db4808afc72841fac623dc68d9bff64d;
+
+	localparam	[255:0]	RX_2	= 256'hc354e99a827a3f1c30f29f6b1d72273eb0daaeb06bb373ed315e305b89d857ca;
+	localparam	[255:0]	RY_2	= 256'h0cb054f95589c1fcbe763df3b8d7badd568d5e93a667076dddfc70dcfab74948;
+	localparam	[255:0]	RZ_2	= 256'hd79d9170dd628aee82d149715a6ec6cc44426ccae236d2a146edbd15a564ea53;
+
+	localparam	[255:0]	PX_3	= 256'hbf5fe30c79025a0b638b0fd62bf1349aee0a9fc7fc2719291b0c23535c16eb52;
+	localparam	[255:0]	PY_3	= 256'h8a637c7c0b9459de664d40a717e1abc0f843f03169fae943e0835cbe767da06b;
+	localparam	[255:0]	PZ_3	= 256'h0871d93601d654216912866514a788a92e8a9b6047611bf185d459e204727377;
+
+	localparam	[255:0]	RX_3	= 256'h1ba6259b5b750e4d6e4f490f661646cd9491be16965f47044ac2688048e567c5;
+	localparam	[255:0]	RY_3	= 256'h80e55c16f403f8d7282bca628477771a45330567caa5aaab9a54919dbe05e3e4;
+	localparam	[255:0]	RZ_3	= 256'hb99663f045c9602b05f23aaaa508e6167d15740be900175dbeceb957a9dad951;
+
+	localparam	[255:0]	PX_4	= 256'hxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx;
+	localparam	[255:0]	PY_4	= 256'hxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx;
+	localparam	[255:0]	PZ_4	= 256'h0000000000000000000000000000000000000000000000000000000000000000;
+
+	localparam	[255:0]	RX_4	= 256'h6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296; // G.x
+	localparam	[255:0]	RY_4	= 256'h4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5; // G.y
+	localparam	[255:0]	RZ_4	= 256'h0000000000000000000000000000000000000000000000000000000000000001;
+
+	localparam	[255:0]	PX_5	= 256'h6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296; // G.x
+	localparam	[255:0]	PY_5	= 256'h4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5; // G.y
+	localparam	[255:0]	PZ_5	= 256'h0000000000000000000000000000000000000000000000000000000000000001;
+
+	localparam	[255:0]	RX_5	= 256'h29d05c193da77b710e86323538b77e1b11f904fea42998be16bd8d744ece7ad0; // H.x
+	localparam	[255:0]	RY_5	= 256'hb01cbd1c01e58065711814b583f061e9d431cca994cea1313449bf97c840ae07; // H.y
+	localparam	[255:0]	RZ_5	= 256'h0000000000000000000000000000000000000000000000000000000000000001;
+	
+	localparam	[255:0]	PX_6	= 256'h6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296; // G.x
+	localparam	[255:0]	PY_6	= 256'h4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5; // G.y
+	localparam	[255:0]	PZ_6	= 256'h0000000000000000000000000000000000000000000000000000000000000001;
+
+	localparam	[255:0]	RX_6	= 256'h0000000000000000000000000000000000000000000000000000000000000001;
+	localparam	[255:0]	RY_6	= 256'h0000000000000000000000000000000000000000000000000000000000000001;
+	localparam	[255:0]	RZ_6	= 256'h0000000000000000000000000000000000000000000000000000000000000000;
+
+	localparam	[255:0]	Q		= 256'hffffffff00000001000000000000000000000000ffffffffffffffffffffffff;
+		
+		
+		//
+		// Core Parameters
+		//
+	localparam	WORD_COUNTER_WIDTH	=  3;
+	localparam	OPERAND_NUM_WORDS		=  8;
+	
+
+		//
+		// Clock (100 MHz)
+		//
+	reg clk = 1'b0;
+	always #5 clk = ~clk;
+
+	
+		//
+		// Inputs, Outputs
+		//
+	reg	rst_n;
+	reg	ena;
+	wire	rdy;
+
+	
+		//
+		// Buffers (PX, PY, PZ, RX, RY, RZ, Q)
+		//
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_px_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_py_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_pz_addr;
+	
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_rx_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_ry_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_rz_addr;
+	
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_q_addr;
+	
+	wire										core_rx_wren;
+	wire										core_ry_wren;
+	wire										core_rz_wren;
+	
+	wire	[                32-1:0]	core_px_data;
+	wire	[                32-1:0]	core_py_data;
+	wire	[                32-1:0]	core_pz_data;
+	
+	wire	[                32-1:0]	core_rx_data_wr;
+	wire	[                32-1:0]	core_ry_data_wr;
+	wire	[                32-1:0]	core_rz_data_wr;
+	
+	wire	[                32-1:0]	core_rx_data_rd;
+	wire	[                32-1:0]	core_ry_data_rd;
+	wire	[                32-1:0]	core_rz_data_rd;	
+	
+	wire	[                32-1:0]	core_q_data;
+	
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_xyzq_addr;
+	reg										tb_xyzq_wren;
+	
+	reg	[                  31:0]	tb_px_data;
+	reg	[                  31:0]	tb_py_data;
+	reg	[                  31:0]	tb_pz_data;
+	wire	[                  31:0]	tb_rx_data;
+	wire	[                  31:0]	tb_ry_data;
+	wire	[                  31:0]	tb_rz_data;
+	reg	[                  31:0]	tb_q_data;
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_px
+	(	.clk(clk),
+		.a_addr(tb_xyzq_addr), .a_wr(tb_xyzq_wren), .a_in(tb_px_data), .a_out(),
+		.b_addr(core_px_addr), .b_out(core_px_data)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_py
+	(	.clk(clk),
+		.a_addr(tb_xyzq_addr), .a_wr(tb_xyzq_wren), .a_in(tb_py_data), .a_out(),
+		.b_addr(core_py_addr), .b_out(core_py_data)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_pz
+	(	.clk(clk),
+		.a_addr(tb_xyzq_addr), .a_wr(tb_xyzq_wren), .a_in(tb_pz_data), .a_out(),
+		.b_addr(core_pz_addr), .b_out(core_pz_data)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_q
+	(	.clk(clk),
+		.a_addr(tb_xyzq_addr), .a_wr(tb_xyzq_wren), .a_in(tb_q_data), .a_out(),
+		.b_addr(core_q_addr), .b_out(core_q_data)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_rx
+	(	.clk(clk),
+		.a_addr(core_rx_addr), .a_wr(core_rx_wren), .a_in(core_rx_data_wr), .a_out(core_rx_data_rd),
+		.b_addr(tb_xyzq_addr), .b_out(tb_rx_data)
+	);	
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_ry
+	(	.clk(clk),
+		.a_addr(core_ry_addr), .a_wr(core_ry_wren), .a_in(core_ry_data_wr), .a_out(core_ry_data_rd),
+		.b_addr(tb_xyzq_addr), .b_out(tb_ry_data)
+	);	
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_rz
+	(	.clk(clk),
+		.a_addr(core_rz_addr), .a_wr(core_rz_wren), .a_in(core_rz_data_wr), .a_out(core_rz_data_rd),
+		.b_addr(tb_xyzq_addr), .b_out(tb_rz_data)
+	);
+	
+	
+		//
+		// Opcode
+		//
+	wire	[ 5: 0]	add_uop_addr;
+	wire	[19: 0]	add_uop;
+	
+	uop_add_rom add_rom
+	(
+		.clk		(clk),
+		.addr		(add_uop_addr),
+		.data		(add_uop)
+    );
+	
+		//
+		// UUT
+		//
+	curve_dbl_add_256 uut
+	(
+		.clk		(clk),
+		.rst_n	(rst_n),
+		
+		.ena		(ena),
+		.rdy		(rdy),
+		
+		.uop_addr	(add_uop_addr),
+		.uop			(add_uop),
+
+		.px_addr	(core_px_addr),
+		.py_addr	(core_py_addr),
+		.pz_addr	(core_pz_addr),
+		.rx_addr	(core_rx_addr),
+		.ry_addr	(core_ry_addr),
+		.rz_addr	(core_rz_addr),
+		.q_addr	(core_q_addr),
+		
+		.rx_wren	(core_rx_wren),
+		.ry_wren	(core_ry_wren),
+		.rz_wren	(core_rz_wren),
+		
+		.px_din	(core_px_data),
+		.py_din	(core_py_data),
+		.pz_din	(core_pz_data),
+		.rx_din	(core_rx_data_rd),
+		.ry_din	(core_ry_data_rd),
+		.rz_din	(core_rz_data_rd),		
+		.rx_dout	(core_rx_data_wr),
+		.ry_dout	(core_ry_data_wr),
+		.rz_dout	(core_rz_data_wr),
+		.q_din	(core_q_data)
+	);
+		
+		
+		//
+		// Testbench Routine
+		//
+	reg ok = 1;	/* overall verdict; every test_curve_adder() call ANDs its own result into it */
+	initial begin
+		
+			/* initialize control inputs: reset held low, no start request */
+		rst_n		= 0;
+		ena		= 0;
+		
+			/* wait for some time (200 ns at the 1 ns timescale) */
+		#200;
+		
+			/* de-assert reset */
+		rst_n		= 1;
+		
+			/* wait for some time (another 100 ns before the first vector) */
+		#100;		
+		
+			/* run tests: arguments are input PX, PY, PZ, then expected RX, RY, RZ; vector 6 passes Q - PY_6 as Y */
+		test_curve_adder(PX_1,     PY_1, PZ_1, RX_1, RY_1, RZ_1);
+		test_curve_adder(PX_2,     PY_2, PZ_2, RX_2, RY_2, RZ_2);
+		test_curve_adder(PX_3,     PY_3, PZ_3, RX_3, RY_3, RZ_3);
+		test_curve_adder(PX_4,     PY_4, PZ_4, RX_4, RY_4, RZ_4);
+		test_curve_adder(PX_5,     PY_5, PZ_5, RX_5, RY_5, RZ_5);
+		test_curve_adder(PX_6, Q - PY_6, PZ_6, RX_6, RY_6, RZ_6);
+		
+			/* print result: one summary line for the whole testbench */
+		if (ok)	$display("tb_curve_adder_256: SUCCESS");
+		else		$display("tb_curve_adder_256: FAILURE");
+		//
+		$finish;	/* stop the simulation; the free-running clk would otherwise keep it alive */
+		//
+	end
+	
+	
+		//
+		// Test Task
+		//	
+	reg		t_ok;
+	
+	integer	w;
+
+	task test_curve_adder;
+			/* run one vector: load P and Q, pulse ena, wait for rdy, read back R and compare */
+		input	[255:0]	px;	/* input point, X coordinate */
+		input	[255:0]	py;	/* input point, Y coordinate */
+		input	[255:0]	pz;	/* input point, Z coordinate */
+		
+		input	[255:0]	rx;	/* expected X read back from the RX buffer */
+		input	[255:0]	ry;	/* expected Y read back from the RY buffer */
+		input	[255:0]	rz;	/* expected Z read back from the RZ buffer */
+		
+		reg	[255:0]	px_shreg;	/* input shift registers, drained 32 bits per write */
+		reg	[255:0]	py_shreg;
+		reg	[255:0]	pz_shreg;
+		
+		reg	[255:0]	rx_shreg;	/* result shift registers, filled 32 bits per read */
+		reg	[255:0]	ry_shreg;
+		reg	[255:0]	rz_shreg;
+		
+		reg	[255:0]	q_shreg;	/* shift register for the Q constant */
+		
+		begin
+		
+				/* start filling memories (one write enable shared by the PX, PY, PZ and Q buffers) */
+			tb_xyzq_wren = 1;
+			
+				/* initialize shift registers */
+			px_shreg = px;
+			py_shreg = py;
+			pz_shreg = pz;
+			q_shreg  = Q;
+			
+				/* write all the words, least significant 32-bit word goes to address 0 */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set addresses */
+				tb_xyzq_addr = w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* set data words */
+				tb_px_data	= px_shreg[31:0];
+				tb_py_data	= py_shreg[31:0];
+				tb_pz_data	= pz_shreg[31:0];
+				tb_q_data	= q_shreg[31:0];
+				
+					/* shift inputs right by one word, vacated upper bits become X */
+				px_shreg = {{32{1'bX}}, px_shreg[255:32]};
+				py_shreg = {{32{1'bX}}, py_shreg[255:32]};
+				pz_shreg = {{32{1'bX}}, pz_shreg[255:32]};
+				q_shreg  = {{32{1'bX}}, q_shreg[255:32]};
+				
+					/* wait for 1 clock tick (10 time units, clk toggles every 5) */
+				#10;
+				
+			end
+			
+				/* wipe addresses (drive X so nothing depends on a stale value) */
+			tb_xyzq_addr = {WORD_COUNTER_WIDTH{1'bX}};
+			
+				/* wipe data words */
+			tb_px_data = {32{1'bX}};
+			tb_py_data = {32{1'bX}};
+			tb_pz_data = {32{1'bX}};
+			tb_q_data  = {32{1'bX}};
+			
+				/* stop filling memories */
+			tb_xyzq_wren = 0;
+			
+				/* start operation */
+			ena = 1;
+			
+				/* clear flag after one clock period, ena is a single-cycle pulse */
+			#10 ena = 0;
+			
+				/* wait for operation to complete, polling rdy once per clock period */
+			while (!rdy) #10;
+			
+				/* read result, word at address 0 ends up in bits [31:0] after all reads */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set address */
+				tb_xyzq_addr = w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+					/* store data word (shift in from the top) */
+				rx_shreg = {tb_rx_data, rx_shreg[255:32]};
+				ry_shreg = {tb_ry_data, ry_shreg[255:32]};
+				rz_shreg = {tb_rz_data, rz_shreg[255:32]};
+
+			end
+			
+				/* compare with case equality: X/Z bits in the read-back data are a definite mismatch, never an unknown t_ok */
+			t_ok =	(rx_shreg === rx) &&
+						(ry_shreg === ry) &&
+						(rz_shreg === rz);
+
+				/* display results */
+			$display("test_curve_adder(): %s", t_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && t_ok;
+		
+		end
+		
+	endtask
+	
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/bench/tb_curve_doubler_256.v b/bench/tb_curve_doubler_256.v
new file mode 100644
index 0000000..c7b7541
--- /dev/null
+++ b/bench/tb_curve_doubler_256.v
@@ -0,0 +1,409 @@
+//------------------------------------------------------------------------------
+//
+// tb_curve_doubler_256.v
+// -----------------------------------------------------------------------------
+// Testbench for 256-bit curve point doubler.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+module tb_curve_doubler_256;
+
+
+		//
+		// Test Vectors (PX/PY/PZ_n: input point, RX/RY/RZ_n: expected result, Q: P-256 prime loaded into the Q buffer)
+		//
+	localparam	[255:0]	PX_1	= 256'h6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296;
+	localparam	[255:0]	PY_1	= 256'h4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5;
+	localparam	[255:0]	PZ_1	= 256'h0000000000000000000000000000000000000000000000000000000000000001;
+
+	localparam	[255:0]	RX_1	= 256'h9a978f59acd1b5ad570e7d52dcfcde43804b42274f61ddcf1e7d848391d6c70f;
+	localparam	[255:0]	RY_1	= 256'h4126885e7f786af905338238e5346d5fe77fc46388668bd0fd59be3190d2f5d1;
+	localparam	[255:0]	RZ_1	= 256'h9fc685c5fc34ff371dcfd694f81f3c2c579c66aed662bd9d976c80d06f7ea3ea;
+
+	localparam	[255:0]	PX_2	= 256'h0ec88440c8b00a9e572bf1bceb7d0c5906bd65990a9b7081130bd72e2c136ca0;
+	localparam	[255:0]	PY_2	= 256'hc0bc77e1339e899101f8e8eccf79c3f7f4bbdd1bf96f6446199bd423026a60d6;
+	localparam	[255:0]	PZ_2	= 256'hdd27cb52a31d1f6e041accf1103de05ba0a5edd74b738d51fe3397de0e3fc306;
+
+	localparam	[255:0]	RX_2	= 256'he6afae63e774df21244609cb4c35d17d28b36b8b9fb7c58929af247f34ac72f9;
+	localparam	[255:0]	RY_2	= 256'h061076db7a5745adc90b2e9eebe0ad6482309690f50b60835c265cf83a1b34eb;
+	localparam	[255:0]	RZ_2	= 256'h1b6bfd04f2a41d68e85423655db1142d97ebaec0c67c450408f427e35c4f054f;
+	
+	localparam	[255:0]	PX_3	= 256'hb0f824c88ec62df89912ca9ffbcbbb4ffb4d80f8a7d7b4a992273261a2f7be7f;
+	localparam	[255:0]	PY_3	= 256'h403e34c78c2b816fce2b1f8d73cfeef28113b8de8bda4a447d17b619bef73705;
+	localparam	[255:0]	PZ_3	= 256'h0e3e81bb8e954f3164ae54a6cffa7fcc9631dfddee55fac61e46415f1f5fe5e2;
+
+	localparam	[255:0]	RX_3	= 256'hd4e725920c88cc2f57847a315f3b6c180abb278b8fa2a47da3d1a191a8c29e19;
+	localparam	[255:0]	RY_3	= 256'ha798ad8dbd66c98b53414ab1d04b0f871929a90fea996c88b96d9d68eb8eb0dc;
+	localparam	[255:0]	RZ_3	= 256'ha7ead72c01294eaf2899bb6b84f7d26417e6758e3db29f3b5c2ca8e9911067f5;
+	
+	localparam	[255:0]	PX_4	= 256'hXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX;
+	localparam	[255:0]	PY_4	= 256'hXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX;
+	localparam	[255:0]	PZ_4	= 256'h0000000000000000000000000000000000000000000000000000000000000000;
+
+	localparam	[255:0]	RX_4	= 256'h0000000000000000000000000000000000000000000000000000000000000001;
+	localparam	[255:0]	RY_4	= 256'h0000000000000000000000000000000000000000000000000000000000000001;
+	localparam	[255:0]	RZ_4	= 256'h0000000000000000000000000000000000000000000000000000000000000000;
+
+	localparam	[255:0]	Q		= 256'hffffffff00000001000000000000000000000000ffffffffffffffffffffffff;
+
+	
+	
+		//
+		// TODO: Test more special cases (vector 4 only covers an input with PZ = 0)!
+		//
+		
+		
+		//
+		// Core Parameters
+		//
+	localparam	WORD_COUNTER_WIDTH	=  3;
+	localparam	OPERAND_NUM_WORDS		=  8;
+	
+
+		//
+		// Clock (100 MHz)
+		//
+	reg clk = 1'b0;
+	always #5 clk = ~clk;
+
+	
+		//
+		// Inputs, Outputs
+		//
+	reg	rst_n;
+	reg	ena;
+	wire	rdy;
+
+	
+		//
+		// Buffers (PX, PY, PZ, RX, RY, RZ, Q)
+		//
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_px_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_py_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_pz_addr;
+	
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_rx_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_ry_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_rz_addr;
+	
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_q_addr;
+	
+	wire										core_rx_wren;
+	wire										core_ry_wren;
+	wire										core_rz_wren;
+	
+	wire	[                32-1:0]	core_px_data;
+	wire	[                32-1:0]	core_py_data;
+	wire	[                32-1:0]	core_pz_data;
+	
+	wire	[                32-1:0]	core_rx_data_wr;
+	wire	[                32-1:0]	core_ry_data_wr;
+	wire	[                32-1:0]	core_rz_data_wr;
+	
+	wire	[                32-1:0]	core_rx_data_rd;
+	wire	[                32-1:0]	core_ry_data_rd;
+	wire	[                32-1:0]	core_rz_data_rd;	
+	
+	wire	[                32-1:0]	core_q_data;
+	
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_xyzq_addr;
+	reg										tb_xyzq_wren;
+	
+	reg	[                  31:0]	tb_px_data;
+	reg	[                  31:0]	tb_py_data;
+	reg	[                  31:0]	tb_pz_data;
+	wire	[                  31:0]	tb_rx_data;
+	wire	[                  31:0]	tb_ry_data;
+	wire	[                  31:0]	tb_rz_data;
+	reg	[                  31:0]	tb_q_data;
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_px
+	(	.clk(clk),
+		.a_addr(tb_xyzq_addr), .a_wr(tb_xyzq_wren), .a_in(tb_px_data), .a_out(),
+		.b_addr(core_px_addr), .b_out(core_px_data)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_py
+	(	.clk(clk),
+		.a_addr(tb_xyzq_addr), .a_wr(tb_xyzq_wren), .a_in(tb_py_data), .a_out(),
+		.b_addr(core_py_addr), .b_out(core_py_data)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_pz
+	(	.clk(clk),
+		.a_addr(tb_xyzq_addr), .a_wr(tb_xyzq_wren), .a_in(tb_pz_data), .a_out(),
+		.b_addr(core_pz_addr), .b_out(core_pz_data)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_q
+	(	.clk(clk),
+		.a_addr(tb_xyzq_addr), .a_wr(tb_xyzq_wren), .a_in(tb_q_data), .a_out(),
+		.b_addr(core_q_addr), .b_out(core_q_data)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_rx
+	(	.clk(clk),
+		.a_addr(core_rx_addr), .a_wr(core_rx_wren), .a_in(core_rx_data_wr), .a_out(core_rx_data_rd),
+		.b_addr(tb_xyzq_addr), .b_out(tb_rx_data)
+	);	
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_ry
+	(	.clk(clk),
+		.a_addr(core_ry_addr), .a_wr(core_ry_wren), .a_in(core_ry_data_wr), .a_out(core_ry_data_rd),
+		.b_addr(tb_xyzq_addr), .b_out(tb_ry_data)
+	);	
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_rz
+	(	.clk(clk),
+		.a_addr(core_rz_addr), .a_wr(core_rz_wren), .a_in(core_rz_data_wr), .a_out(core_rz_data_rd),
+		.b_addr(tb_xyzq_addr), .b_out(tb_rz_data)
+	);
+	
+	
+		//
+		// Opcode
+		//
+	wire	[ 5: 0]	dbl_uop_addr;
+	wire	[19: 0]	dbl_uop;
+	
+	uop_dbl_rom dbl_rom
+	(
+		.clk		(clk),
+		.addr		(dbl_uop_addr),
+		.data		(dbl_uop)
+    );
+	
+	
+		//
+		// UUT
+		//
+	curve_dbl_add_256 uut
+	(
+		.clk		(clk),
+		.rst_n	(rst_n),
+		
+		.ena		(ena),
+		.rdy		(rdy),
+		
+		.uop_addr	(dbl_uop_addr),
+		.uop			(dbl_uop),
+
+		.px_addr	(core_px_addr),
+		.py_addr	(core_py_addr),
+		.pz_addr	(core_pz_addr),
+		.rx_addr	(core_rx_addr),
+		.ry_addr	(core_ry_addr),
+		.rz_addr	(core_rz_addr),
+		.q_addr	(core_q_addr),
+		
+		.rx_wren	(core_rx_wren),
+		.ry_wren	(core_ry_wren),
+		.rz_wren	(core_rz_wren),
+		
+		.px_din	(core_px_data),
+		.py_din	(core_py_data),
+		.pz_din	(core_pz_data),
+		.rx_din	(core_rx_data_rd),
+		.ry_din	(core_ry_data_rd),
+		.rz_din	(core_rz_data_rd),		
+		.rx_dout	(core_rx_data_wr),
+		.ry_dout	(core_ry_data_wr),
+		.rz_dout	(core_rz_data_wr),
+		.q_din	(core_q_data)
+	);
+		
+		
+		//
+		// Testbench Routine
+		//
+	reg ok = 1;	/* overall verdict; every test_curve_doubler() call ANDs its own result into it */
+	initial begin
+		
+			/* initialize control inputs: reset held low, no start request */
+		rst_n		= 0;
+		ena		= 0;
+		
+			/* wait for some time (200 ns at the 1 ns timescale) */
+		#200;
+		
+			/* de-assert reset */
+		rst_n		= 1;
+		
+			/* wait for some time (another 100 ns before the first vector) */
+		#100;		
+		
+			/* run tests: arguments are input PX, PY, PZ, then expected RX, RY, RZ */
+		test_curve_doubler(PX_1, PY_1, PZ_1, RX_1, RY_1, RZ_1);
+		test_curve_doubler(PX_2, PY_2, PZ_2, RX_2, RY_2, RZ_2);
+		test_curve_doubler(PX_3, PY_3, PZ_3, RX_3, RY_3, RZ_3);
+		test_curve_doubler(PX_4, PY_4, PZ_4, RX_4, RY_4, RZ_4);
+		
+			/* print result: one summary line for the whole testbench */
+		if (ok)	$display("tb_curve_doubler_256: SUCCESS");
+		else		$display("tb_curve_doubler_256: FAILURE");
+		//
+		$finish;	/* end the run; without it the free-running clk keeps the simulation going forever */
+		//
+	end
+	
+	
+		//
+		// Test Task
+		//	
+	reg		t_ok;
+	
+	integer	w;
+
+	task test_curve_doubler;
+			/* run one vector: load P and Q, pulse ena, wait for rdy, read back R and compare */
+		input	[255:0]	px;	/* input point, X coordinate */
+		input	[255:0]	py;	/* input point, Y coordinate */
+		input	[255:0]	pz;	/* input point, Z coordinate */
+		
+		input	[255:0]	rx;	/* expected X read back from the RX buffer */
+		input	[255:0]	ry;	/* expected Y read back from the RY buffer */
+		input	[255:0]	rz;	/* expected Z read back from the RZ buffer */
+		
+		reg	[255:0]	px_shreg;	/* input shift registers, drained 32 bits per write */
+		reg	[255:0]	py_shreg;
+		reg	[255:0]	pz_shreg;
+		
+		reg	[255:0]	rx_shreg;	/* result shift registers, filled 32 bits per read */
+		reg	[255:0]	ry_shreg;
+		reg	[255:0]	rz_shreg;
+		
+		reg	[255:0]	q_shreg;	/* shift register for the Q constant */
+		
+		begin
+		
+				/* start filling memories (one write enable shared by the PX, PY, PZ and Q buffers) */
+			tb_xyzq_wren = 1;
+			
+				/* initialize shift registers */
+			px_shreg = px;
+			py_shreg = py;
+			pz_shreg = pz;
+			q_shreg  = Q;
+			
+				/* write all the words, least significant 32-bit word goes to address 0 */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set addresses */
+				tb_xyzq_addr = w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* set data words */
+				tb_px_data	= px_shreg[31:0];
+				tb_py_data	= py_shreg[31:0];
+				tb_pz_data	= pz_shreg[31:0];
+				tb_q_data	= q_shreg[31:0];
+				
+					/* shift inputs right by one word, vacated upper bits become X */
+				px_shreg = {{32{1'bX}}, px_shreg[255:32]};
+				py_shreg = {{32{1'bX}}, py_shreg[255:32]};
+				pz_shreg = {{32{1'bX}}, pz_shreg[255:32]};
+				q_shreg  = {{32{1'bX}}, q_shreg[255:32]};
+				
+					/* wait for 1 clock tick (10 time units, clk toggles every 5) */
+				#10;
+				
+			end
+			
+				/* wipe addresses (drive X so nothing depends on a stale value) */
+			tb_xyzq_addr = {WORD_COUNTER_WIDTH{1'bX}};
+			
+				/* wipe data words */
+			tb_px_data = {32{1'bX}};
+			tb_py_data = {32{1'bX}};
+			tb_pz_data = {32{1'bX}};
+			tb_q_data  = {32{1'bX}};
+			
+				/* stop filling memories */
+			tb_xyzq_wren = 0;
+			
+				/* start operation */
+			ena = 1;
+			
+				/* clear flag after one clock period, ena is a single-cycle pulse */
+			#10 ena = 0;
+			
+				/* wait for operation to complete, polling rdy once per clock period */
+			while (!rdy) #10;
+			
+				/* read result, word at address 0 ends up in bits [31:0] after all reads */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set address */
+				tb_xyzq_addr = w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+					/* store data word (shift in from the top) */
+				rx_shreg = {tb_rx_data, rx_shreg[255:32]};
+				ry_shreg = {tb_ry_data, ry_shreg[255:32]};
+				rz_shreg = {tb_rz_data, rz_shreg[255:32]};
+
+			end
+			
+				/* compare with case equality: X/Z bits in the read-back data are a definite mismatch, never an unknown t_ok */
+			t_ok =	(rx_shreg === rx) &&
+						(ry_shreg === ry) &&
+						(rz_shreg === rz);
+
+				/* display results */
+			$display("test_curve_doubler(): %s", t_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && t_ok;
+		
+		end
+		
+	endtask
+	
+      
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/bench/tb_curve_multiplier_256.v b/bench/tb_curve_multiplier_256.v
new file mode 100644
index 0000000..bcca034
--- /dev/null
+++ b/bench/tb_curve_multiplier_256.v
@@ -0,0 +1,281 @@
+//------------------------------------------------------------------------------
+//
+// tb_curve_multiplier_256.v
+// -----------------------------------------------------------------------------
+// Testbench for 256-bit curve point scalar multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+module tb_curve_multiplier_256;
+
+
+		//
+		// Test Vectors
+		//
+	localparam	[255:0]	K_1	= 256'h70a12c2db16845ed56ff68cfc21a472b3f04d7d6851bf6349f2d7d5b3452b38a;
+	localparam	[255:0]	PX_1	= 256'h8101ece47464a6ead70cf69a6e2bd3d88691a3262d22cba4f7635eaff26680a8;
+	localparam	[255:0]	PY_1	= 256'hd8a12ba61d599235f67d9cb4d58f1783d3ca43e78f0a5abaa624079936c0c3a9;
+
+	localparam	[255:0]	K_2	= 256'h580ec00d856434334cef3f71ecaed4965b12ae37fa47055b1965c7b134ee45d0;
+	localparam	[255:0]	PX_2	= 256'h7214bc9647160bbd39ff2f80533f5dc6ddd70ddf86bb815661e805d5d4e6f27c;
+	localparam	[255:0]	PY_2	= 256'h8b81e3e977597110c7cf2633435b2294b72642987defd3d4007e1cfc5df84541;
+
+	localparam	[255:0]	K_3	= 256'hffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551;
+	localparam	[255:0]	PX_3	= 256'h0000000000000000000000000000000000000000000000000000000000000000;
+	localparam	[255:0]	PY_3	= 256'h0000000000000000000000000000000000000000000000000000000000000000;
+		
+		
+		//
+		// Core Parameters
+		//
+	localparam	WORD_COUNTER_WIDTH	=  3;
+	localparam	OPERAND_NUM_WORDS		=  8;
+	
+
+		//
+		// Clock (100 MHz)
+		//
+	reg clk = 1'b0;
+	always #5 clk = ~clk;
+
+	
+		//
+		// Inputs, Outputs
+		//
+	reg	rst_n;
+	reg	ena;
+	wire	rdy;
+
+	
+		//
+		// Buffers (K, PX, PY)
+		//
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_k_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_px_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_py_addr;
+	
+	wire										core_px_wren;
+	wire										core_py_wren;
+	
+	wire	[                32-1:0]	core_k_data;
+	wire	[                32-1:0]	core_px_data;
+	wire	[                32-1:0]	core_py_data;
+	
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_k_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_pxy_addr;
+	
+	reg										tb_k_wren;
+	
+	reg	[                  31:0]	tb_k_data;
+	wire	[                  31:0]	tb_px_data;
+	wire	[                  31:0]	tb_py_data;
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_k
+	(	.clk(clk),
+		.a_addr(tb_k_addr), .a_wr(tb_k_wren), .a_in(tb_k_data), .a_out(),
+		.b_addr(core_k_addr), .b_out(core_k_data)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_px
+	(	.clk(clk),
+		.a_addr(core_px_addr), .a_wr(core_px_wren), .a_in(core_px_data), .a_out(),
+		.b_addr(tb_pxy_addr), .b_out(tb_px_data)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_py
+	(	.clk(clk),
+		.a_addr(core_py_addr), .a_wr(core_py_wren), .a_in(core_py_data), .a_out(),
+		.b_addr(tb_pxy_addr), .b_out(tb_py_data)
+	);
+	
+	
+		//
+		// UUT
+		//
+	curve_mul_256 uut
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(ena),
+		.rdy			(rdy),
+		
+		.k_addr		(core_k_addr),
+		.rx_addr		(core_px_addr),
+		.ry_addr		(core_py_addr),
+		
+		.rx_wren		(core_px_wren),
+		.ry_wren		(core_py_wren),
+		
+		.k_din		(core_k_data),
+		
+		.rx_dout		(core_px_data),
+		.ry_dout		(core_py_data)
+	);
+		
+		
+		//
+		// Testbench Routine
+		//
+	reg ok = 1;		// overall verdict, AND-ed with the result of every test
+	initial begin
+		
+			/* initialize control inputs, including the K buffer write strobe */
+		rst_n		= 0;
+		ena		= 0;
+		tb_k_wren	= 0;
+			/* hold reset for 200 ns (20 periods of the 10 ns clock) */
+		#200;
+		
+			/* de-assert reset */
+		rst_n		= 1;
+		
+			/* wait for some time */
+		#100;		
+		
+			/* run tests (the third vector expects all-zero output words) */
+		test_curve_multiplier(K_1, PX_1, PY_1);
+		test_curve_multiplier(K_2, PX_2, PY_2);
+		test_curve_multiplier(K_3, PX_3, PY_3);
+		
+			/* print result */
+		if (ok)	$display("tb_curve_multiplier_256: SUCCESS");
+		else		$display("tb_curve_multiplier_256: FAILURE");
+		// $finish is left commented out: the simulation keeps running after the verdict
+		//$finish;
+		//
+	end
+	
+	
+		//
+		// Test Task
+		//	
+	reg		p_ok;		// outcome of the most recent test (1 = both outputs matched)
+	
+	integer	w;		// word index shared by the write and read-back loops
+
+	task test_curve_multiplier;
+		// Loads k into bram_k, pulses ena, waits for rdy, then compares bram_px/bram_py with px/py.
+		input	[255:0]	k;
+		input	[255:0]	px;
+		input	[255:0]	py;
+		// k is the scalar operand; px and py are the expected X and Y outputs (256 bits each).
+		reg	[255:0]	k_shreg;
+		reg	[255:0]	px_shreg;
+		reg	[255:0]	py_shreg;
+		
+		begin
+		
+				/* assert the write enable of the K buffer */
+			tb_k_wren = 1;
+			
+				/* load the scalar; it is shifted out 32 bits at a time below */
+			k_shreg = k;
+			
+				/* write all the words, least significant word at address 0 */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set addresses */
+				tb_k_addr = w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* set data words */
+				tb_k_data	= k_shreg[31:0];
+				
+					/* shift right by one word, vacated upper bits are filled with X */
+				k_shreg = {{32{1'bX}}, k_shreg[255:32]};
+				
+					/* wait for 1 clock tick (10 ns) */
+				#10;
+				
+			end
+			
+				/* wipe addresses (drive X now that writing is done) */
+			tb_k_addr = {WORD_COUNTER_WIDTH{1'bX}};
+			
+				/* wipe data words */
+			tb_k_data = {32{1'bX}};
+			
+				/* stop filling memories */
+			tb_k_wren = 0;
+			
+				/* start operation */
+			ena = 1;
+			
+				/* keep ena high for 10 ns, then clear it */
+			#10 ena = 0;
+			
+				/* wait for operation to complete, sampling rdy every 10 ns */
+			while (!rdy) #10;
+			
+				/* read result back, one word per clock tick */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set address */
+				tb_pxy_addr = w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+					/* store data word: it enters at the top, so after all reads word 0 sits in bits [31:0] */
+				px_shreg = {tb_px_data, px_shreg[255:32]};
+				py_shreg = {tb_py_data, py_shreg[255:32]};
+
+			end
+			
+				/* compare reassembled outputs with the expected values */
+			p_ok =	(px_shreg == px) &&
+						(py_shreg == py);
+
+				/* display results */
+			$display("test_curve_multiplier(): %s", p_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && p_ok;
+		
+		end
+		
+	endtask
+	
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/bench/tb_lowlevel_adder32.v b/bench/tb_lowlevel_adder32.v
new file mode 100644
index 0000000..2caffbd
--- /dev/null
+++ b/bench/tb_lowlevel_adder32.v
@@ -0,0 +1,175 @@
+//------------------------------------------------------------------------------
+//
+// tb_lowlevel_adder32.v
+// -----------------------------------------------------------------------------
+// Testbench for 32-bit adder.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+
+module tb_lowlevel_adder32;
+
+		//
+		// Inputs
+		//
+	reg				clk;
+	reg	[31: 0]	a;	
+	reg	[31: 0]	b;
+	reg				c_in;
+
+		//
+		// Outputs
+		//
+	wire	[31: 0]	s;	
+	wire				c_out;
+
+		//
+		// Test Vectors {a, b, c_in}
+		//
+	wire	[64: 0]	vec_0		= {32'h00000000, 32'h00000000, 1'b0};	// all zeroes, no carry
+	wire	[64: 0]	vec_1		= {32'h00000000, 32'h00000000, 1'b1};	// all zeroes with carry
+	wire	[64: 0]	vec_2		= {32'h00000000, 32'hFFFFFFFF, 1'b0};	// zeroes and ones, no carry
+	wire	[64: 0]	vec_3		= {32'h00000000, 32'hFFFFFFFF, 1'b1};	// zeroes and ones with carry
+	wire	[64: 0]	vec_4		= {32'hFFFFFFFF, 32'h00000000, 1'b0};	// ones and zeroes, no carry
+	wire	[64: 0]	vec_5		= {32'hFFFFFFFF, 32'h00000000, 1'b1};	// ones and zeroes with carry
+	wire	[64: 0]	vec_6		= {32'hFFFFFFFF, 32'hFFFFFFFF, 1'b0};	// all ones, no carry
+	wire	[64: 0]	vec_7		= {32'hFFFFFFFF, 32'hFFFFFFFF, 1'b1};	// all ones with carry
+	
+	wire	[64: 0]	vec_8		= {32'hd898c296, 32'h37bf51f5, 1'b0};	// random values, no carry
+	wire	[64: 0]	vec_9		= {32'hf4a13945, 32'hcbb64068, 1'b0};	// random values, no carry
+	wire	[64: 0]	vec_10	= {32'h2deb33a0, 32'h6b315ece, 1'b0};	// random values, no carry
+	wire	[64: 0]	vec_11	= {32'h77037d81, 32'h2bce3357, 1'b0};	// random values, no carry
+	wire	[64: 0]	vec_12	= {32'h63a440f2, 32'h7c0f9e16, 1'b1};	// random values with carry
+	wire	[64: 0]	vec_13	= {32'hf8bce6e5, 32'h8ee7eb4a, 1'b1};	// random values with carry
+	wire	[64: 0]	vec_14	= {32'he12c4247, 32'hfe1a7f9b, 1'b1};	// random values with carry
+	wire	[64: 0]	vec_15	= {32'h6b17d1f2, 32'h4fe342e2, 1'b1};	// random values with carry
+	
+
+		//
+		// UUT
+		//
+	adder32_wrapper uut
+	(
+		.clk		(clk),
+		.a			(a),
+		.b			(b),
+		.s			(s),
+		.c_in		(c_in),
+		.c_out	(c_out)
+	);
+
+
+		//
+		// Testbench Routine
+		//
+	reg ok = 1;		// overall verdict, AND-ed with the result of every vector
+	initial begin
+		// the clock is pulsed manually inside test_adder32(), start with it low
+		clk = 0;
+		// idle for 100 ns before the first vector
+		#100;
+		// corner cases: all-zero / all-one operands, with and without carry-in
+		test_adder32(vec_0);
+		test_adder32(vec_1);
+		test_adder32(vec_2);
+		test_adder32(vec_3);
+		test_adder32(vec_4);
+		test_adder32(vec_5);
+		test_adder32(vec_6);
+		test_adder32(vec_7);
+		// random operands, with and without carry-in
+		test_adder32(vec_8);
+		test_adder32(vec_9);
+		test_adder32(vec_10);
+		test_adder32(vec_11);
+		test_adder32(vec_12);
+		test_adder32(vec_13);
+		test_adder32(vec_14);
+		test_adder32(vec_15);
+		// report the overall verdict
+		if (ok)	$display("tb_lowlevel_adder32: SUCCESS");
+		else		$display("tb_lowlevel_adder32: FAILURE");
+		// end the simulation
+		$finish;
+		//
+	end
+      
+		
+		//
+		// Test Routine
+		//
+	reg	[31: 0]	ss;		// reference value of sum
+	reg				cc;		// reference value of carry
+	reg				ss_ok;	// result matches reference value
+	// test_adder32(vec): applies one vector, sends a single clock pulse and checks {c_out, s}
+	task test_adder32;
+		// vec = {a[31:0], b[31:0], c_in}
+		input	[64: 0] vec;
+
+		begin
+				
+				/* break down test vector */
+			a		= vec[64:33];
+			b		= vec[32: 1];
+			c_in	= vec[ 0: 0];
+			
+				/* calculate reference values (33-bit sum, bit 32 is the expected carry out) */
+			{cc, ss} = {1'b0, a} + {1'b0, b} + {32'd0, c_in};
+			
+				/* send one clock tick */
+			#10 clk = 1;
+			#10 clk = 0;
+			
+				/* check outputs */
+			ss_ok = (s == ss) && (c_out == cc);
+
+				/* display results (verdict of this vector only, not the accumulated flag) */
+			$display("test_adder32(): 0x%08X + 0x%08X + %01d = {%01d, 0x%08X} [%0s]", a, b, c_in, c_out, s, ss_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && ss_ok;
+
+		end
+
+	endtask
+		
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/bench/tb_lowlevel_adder47.v b/bench/tb_lowlevel_adder47.v
new file mode 100644
index 0000000..2a575a8
--- /dev/null
+++ b/bench/tb_lowlevel_adder47.v
@@ -0,0 +1,151 @@
+//------------------------------------------------------------------------------
+//
+// tb_lowlevel_adder47.v
+// -----------------------------------------------------------------------------
+// Testbench for 47-bit adder.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+
+module tb_lowlevel_adder47;
+
+		//
+		// Inputs
+		//
+	reg				clk;
+	reg	[46: 0]	a;	
+	reg	[46: 0]	b;
+
+		//
+		// Outputs
+		//
+	wire	[46: 0]	s;	
+
+		//
+		// Test Vectors {a, b}
+		//
+	wire	[93: 0]	vec_0		= {47'h2a87ca22be8b, 47'h05378eb1c71e};
+	wire	[93: 0]	vec_1		= {47'h7320ad746e1d, 47'h3b628ba79b98};
+	wire	[93: 0]	vec_2		= {47'h59f741e08254, 47'h2a385502f25d};
+	wire	[93: 0]	vec_3		= {47'h3f55296c3a54, 47'h5e3872760ab7};
+	wire	[93: 0]	vec_4		= {47'h3617de4a9626, 47'h2c6f5d9e98bf};
+	wire	[93: 0]	vec_5		= {47'h1292dc29f8f4, 47'h1dbd289a147c};
+	wire	[93: 0]	vec_6		= {47'h69da3113b5f0, 47'h38c00a60b1ce};
+	wire	[93: 0]	vec_7		= {47'h1d7e819d7a43, 47'h1d7c90ea0e5f};
+	
+		//
+		// UUT
+		//
+	adder47_wrapper uut
+	(
+		.clk		(clk),
+		.a			(a),
+		.b			(b),
+		.s			(s)
+	);
+
+
+		//
+		// Testbench Routine
+		//
+	reg ok = 1;		// overall verdict, AND-ed with the result of every vector
+	initial begin
+		// the clock is pulsed manually inside test_adder47(), start with it low
+		clk = 0;
+		// idle for 100 ns before the first vector
+		#100;
+		// run all eight {a, b} vectors
+		test_adder47(vec_0);
+		test_adder47(vec_1);
+		test_adder47(vec_2);
+		test_adder47(vec_3);
+		test_adder47(vec_4);
+		test_adder47(vec_5);
+		test_adder47(vec_6);
+		test_adder47(vec_7);
+		// report the overall verdict
+		if (ok)	$display("tb_lowlevel_adder47: SUCCESS");
+		else		$display("tb_lowlevel_adder47: FAILURE");
+		// end the simulation
+		$finish;
+		//
+	end
+      
+		
+		//
+		// Test Routine
+		//
+	reg	[46: 0]	ss;		// reference value of sum
+	reg				cc;		// never assigned: the UUT has no carry output to compare against
+	reg				ss_ok;	// result matches reference value
+	// test_adder47(vec): applies one vector, sends a single clock pulse and checks s
+	task test_adder47;
+		// vec = {a[46:0], b[46:0]}
+		input	[93: 0] vec;
+
+		begin
+				
+				/* break down test vector */
+			a		= vec[93:47];
+			b		= vec[46: 0];
+			
+				/* calculate reference values (sum is truncated to 47 bits) */
+			ss = a + b;
+			
+				/* send one clock tick */
+			#10 clk = 1;
+			#10 clk = 0;
+			
+				/* check outputs */
+			ss_ok = (s == ss);
+
+				/* display results (verdict of this vector only, not the accumulated flag) */
+			$display("test_adder47(): %s", ss_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && ss_ok;
+
+		end
+
+	endtask
+		
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/bench/tb_lowlevel_subtractor32.v b/bench/tb_lowlevel_subtractor32.v
new file mode 100644
index 0000000..e1129e2
--- /dev/null
+++ b/bench/tb_lowlevel_subtractor32.v
@@ -0,0 +1,174 @@
+//------------------------------------------------------------------------------
+//
+// tb_lowlevel_subtractor32.v
+// -----------------------------------------------------------------------------
+// Testbench for 32-bit subtractor.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+module tb_lowlevel_subtractor32;
+
+		//
+		// Inputs
+		//
+	reg				clk;
+	reg	[31: 0]	a;
+	reg	[31: 0]	b;
+	reg				b_in;
+
+		//
+		// Outputs
+		//
+	wire	[31: 0]	d;
+	wire				b_out;
+
+		//
+		// Test Vectors {a, b, b_in}
+		//
+	wire	[64: 0]	vec_0		= {32'h00000000, 32'h00000000, 1'b0};	// all zeroes, no borrow
+	wire	[64: 0]	vec_1		= {32'h00000000, 32'h00000000, 1'b1};	// all zeroes with borrow
+	wire	[64: 0]	vec_2		= {32'h00000000, 32'hFFFFFFFF, 1'b0};	// zeroes and ones, no borrow
+	wire	[64: 0]	vec_3		= {32'h00000000, 32'hFFFFFFFF, 1'b1};	// zeroes and ones with borrow
+	wire	[64: 0]	vec_4		= {32'hFFFFFFFF, 32'h00000000, 1'b0};	// ones and zeroes, no borrow
+	wire	[64: 0]	vec_5		= {32'hFFFFFFFF, 32'h00000000, 1'b1};	// ones and zeroes with borrow
+	wire	[64: 0]	vec_6		= {32'hFFFFFFFF, 32'hFFFFFFFF, 1'b0};	// all ones, no borrow
+	wire	[64: 0]	vec_7		= {32'hFFFFFFFF, 32'hFFFFFFFF, 1'b1};	// all ones with borrow
+	
+	wire	[64: 0]	vec_8		= {32'hd898c296, 32'h37bf51f5, 1'b0};	// random values, no borrow
+	wire	[64: 0]	vec_9		= {32'hf4a13945, 32'hcbb64068, 1'b0};	// random values, no borrow
+	wire	[64: 0]	vec_10	= {32'h2deb33a0, 32'h6b315ece, 1'b0};	// random values, no borrow
+	wire	[64: 0]	vec_11	= {32'h77037d81, 32'h2bce3357, 1'b0};	// random values, no borrow
+	wire	[64: 0]	vec_12	= {32'h63a440f2, 32'h7c0f9e16, 1'b1};	// random values with borrow
+	wire	[64: 0]	vec_13	= {32'hf8bce6e5, 32'h8ee7eb4a, 1'b1};	// random values with borrow
+	wire	[64: 0]	vec_14	= {32'he12c4247, 32'hfe1a7f9b, 1'b1};	// random values with borrow
+	wire	[64: 0]	vec_15	= {32'h6b17d1f2, 32'h4fe342e2, 1'b1};	// random values with borrow
+
+
+		//
+		// UUT
+		//
+	subtractor32_wrapper uut
+	(
+		.clk		(clk),
+		.a			(a),
+		.b			(b),
+		.d			(d),
+		.b_in		(b_in),
+		.b_out	(b_out)
+	);
+
+	
+		//
+		// Testbench Routine
+		//
+	reg ok = 1;		// overall verdict, AND-ed with the result of every vector
+	initial begin
+		// the clock is pulsed manually inside test_subtractor32(), start with it low
+		clk = 0;
+		// idle for 100 ns before the first vector
+		#100;
+		// corner cases: all-zero / all-one operands, with and without borrow-in
+		test_subtractor32(vec_0);
+		test_subtractor32(vec_1);
+		test_subtractor32(vec_2);
+		test_subtractor32(vec_3);
+		test_subtractor32(vec_4);
+		test_subtractor32(vec_5);
+		test_subtractor32(vec_6);
+		test_subtractor32(vec_7);
+		// random operands, with and without borrow-in
+		test_subtractor32(vec_8);
+		test_subtractor32(vec_9);
+		test_subtractor32(vec_10);
+		test_subtractor32(vec_11);
+		test_subtractor32(vec_12);
+		test_subtractor32(vec_13);
+		test_subtractor32(vec_14);
+		test_subtractor32(vec_15);
+		// report the overall verdict
+		if (ok)	$display("tb_lowlevel_subtractor32: SUCCESS");
+		else		$display("tb_lowlevel_subtractor32: FAILURE");
+		// end the simulation
+		$finish;
+		//
+	end
+      
+		
+		//
+		// Test Routine
+		//
+	reg	[31: 0]	dd;		// reference value of difference
+	reg				bb;		// reference value of borrow
+	reg				dd_ok;	// result matches reference value
+	// test_subtractor32(vec): applies one vector, sends a single clock pulse and checks {b_out, d}
+	task test_subtractor32;
+		// vec = {a[31:0], b[31:0], b_in}
+		input	[64: 0] vec;
+
+		begin
+		
+				/* break down test vector */
+			a		= vec[64:33];
+			b		= vec[32: 1];
+			b_in	= vec[ 0: 0];
+
+				/* calculate reference values (33-bit difference, bit 32 is the expected borrow out) */
+			{bb, dd} = {1'b0, a} - {1'b0, b} - {32'd0, b_in};
+
+				/* send one clock tick */
+			#10 clk = 1;
+			#10 clk = 0;
+			
+				/* check outputs */
+			dd_ok = (d == dd) && (b_out == bb);
+
+				/* display results (operands, UUT outputs and the verdict of this vector) */
+			$display("test_subtractor32(): 0x%08X - (0x%08X + %01d) = {%01d, 0x%08X} [%0s]", a, b, b_in, b_out, d, dd_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && dd_ok;
+			
+		end
+		
+	endtask
+		
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/bench/tb_modular_adder.v b/bench/tb_modular_adder.v
new file mode 100644
index 0000000..1015b77
--- /dev/null
+++ b/bench/tb_modular_adder.v
@@ -0,0 +1,357 @@
+//------------------------------------------------------------------------------
+//
+// tb_modular_adder.v
+// -----------------------------------------------------------------------------
+// Testbench for modular multi-word adder.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+module tb_modular_adder_256;
+
+
+		//
+		// Test Vectors
+		//	
+	localparam	[255:0]	N		= 256'hffffffff00000001000000000000000000000000ffffffffffffffffffffffff;
+		
+	localparam	[255:0]	X_1	= 256'h1ddbd0769df27bab1e234019dad09dccce1e87e2193b417ffa1a3465d7439ecd;
+	localparam	[255:0]	Y_1	= 256'h1f67cdc34bac91a072945d212f0a03442fc4855788583ecb7b2e375ad3848210;
+	
+	localparam	[255:0]	X_2	= 256'hff563f653b1392a6fa6b0295a280f7a904a11e22d8ae468e220301d8ac232fcf;
+	localparam	[255:0]	Y_2	= 256'hf6f53c4b57b25453b68e923fb118e4f753d74af01fc58476dd15a80933453899;
+	
+	
+		//
+		// Core Parameters
+		//
+	localparam	WORD_COUNTER_WIDTH	=  3;
+	localparam	OPERAND_NUM_WORDS		=  8;
+	
+
+		//
+		// Clock (100 MHz)
+		//
+	reg clk = 1'b0;
+	always #5 clk = ~clk;
+
+	
+		//
+		// Inputs, Outputs
+		//
+	reg				rst_n;
+	reg				ena;
+	wire				rdy;
+	
+	
+		//
+		// Buffers (X, Y, N)
+		//
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_xy_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_n_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_s_addr;
+	wire										core_s_wren;
+	
+	wire	[                  31:0]	core_x_data;
+	wire	[                  31:0]	core_y_data;
+	wire	[                  31:0]	core_n_data;
+	wire	[                  31:0]	core_s_data;
+	
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_xyn_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_s_addr;
+	reg										tb_xyn_wren;
+	
+	reg	[                  31:0]	tb_x_data;
+	reg	[                  31:0]	tb_y_data;
+	reg	[                  31:0]	tb_n_data;
+	wire	[                  31:0]	tb_s_data;
+	
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_WIDTH			(32),
+		.MEM_ADDR_BITS		(WORD_COUNTER_WIDTH)
+	)
+	bram_x
+	(
+		.clk		(clk),
+
+		.a_addr	(tb_xyn_addr),
+		.a_wr		(tb_xyn_wren),
+		.a_in		(tb_x_data),
+		.a_out	(),
+
+		.b_addr	(core_xy_addr),
+		.b_out	(core_x_data)
+	);
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_WIDTH			(32),
+		.MEM_ADDR_BITS		(WORD_COUNTER_WIDTH)
+	)
+	bram_y
+	(
+		.clk		(clk),
+
+		.a_addr	(tb_xyn_addr),
+		.a_wr		(tb_xyn_wren),
+		.a_in		(tb_y_data),
+		.a_out	(),
+
+		.b_addr	(core_xy_addr),
+		.b_out	(core_y_data)
+	);
+	
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_WIDTH			(32),
+		.MEM_ADDR_BITS		(WORD_COUNTER_WIDTH)
+	)
+	bram_n
+	(
+		.clk		(clk),
+
+		.a_addr	(tb_xyn_addr),
+		.a_wr		(tb_xyn_wren),
+		.a_in		(tb_n_data),
+		.a_out	(),
+
+		.b_addr	(core_n_addr),
+		.b_out	(core_n_data)
+	);
+	
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_WIDTH			(32),
+		.MEM_ADDR_BITS		(WORD_COUNTER_WIDTH)
+	)
+	bram_s
+	(
+		.clk		(clk),
+
+		.a_addr	(core_s_addr),
+		.a_wr		(core_s_wren),
+		.a_in		(core_s_data),
+		.a_out	(),
+
+		.b_addr	(tb_s_addr),
+		.b_out	(tb_s_data)
+	);
+	
+	
+		//
+		// UUT
+		//
+	modular_adder #
+	(
+		.WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH),
+		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS)
+	)
+	uut
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(ena),
+		.rdy			(rdy),
+		
+		.ab_addr		(core_xy_addr),
+		.n_addr		(core_n_addr),
+		.s_addr		(core_s_addr),
+		.s_wren		(core_s_wren),
+		
+		.a_din		(core_x_data),
+		.b_din		(core_y_data),
+		.n_din		(core_n_data),
+		.s_dout		(core_s_data)
+	);
+		
+		
+		//
+		// Testbench Routine
+		//
+	reg ok = 1;		// overall verdict, AND-ed with the result of every test
+	initial begin
+		
+			/* initialize control inputs */
+		rst_n			= 0;
+		ena			= 0;
+		// keep the operand buffers from being written until a test starts
+		tb_xyn_wren	= 0;
+		
+			/* hold reset for 200 ns (20 periods of the 10 ns clock) */
+		#200;
+		
+			/* de-assert reset */
+		rst_n		= 1;
+		
+			/* wait for some time */
+		#100;		
+		
+			/* run tests: each operand pair is applied in both orders, always modulo N */
+		test_modular_adder(X_1, Y_1, N);
+		test_modular_adder(X_2, Y_2, N);
+		test_modular_adder(Y_1, X_1, N);
+		test_modular_adder(Y_2, X_2, N);
+
+		test_modular_adder(X_1, X_2, N);
+		test_modular_adder(X_2, X_1, N);
+		test_modular_adder(Y_1, Y_2, N);
+		test_modular_adder(Y_2, Y_1, N);
+		
+		test_modular_adder(X_1, Y_2, N);
+		test_modular_adder(Y_2, X_1, N);
+		test_modular_adder(X_2, Y_1, N);
+		test_modular_adder(Y_1, X_2, N);		
+		
+			/* print result */
+		if (ok)	$display("tb_modular_adder_256: SUCCESS");
+		else		$display("tb_modular_adder_256: FAILURE");
+		// end the simulation
+		$finish;
+		//
+	end
+	
+	
+		//
+		// Test Task
+		//	
+	reg	[256:0]	s;		// reference sum, one bit wider than the operands to hold the carry
+	wire	[255:0]	s_dummy = s[255:0];		// lower 256 bits of the reference value (not read by this testbench)
+	reg				s_ok;		// outcome of the most recent test
+
+	integer			w;		// word index shared by the write and read-back loops
+	
+	reg	[255:0]	x_shreg;
+	reg	[255:0]	y_shreg;
+	reg	[255:0]	n_shreg;
+	reg	[255:0]	s_shreg;
+	// test_modular_adder(x, y, n): loads the buffers, runs the core and checks the sum it stores
+	task test_modular_adder;
+		// x, y: operands; n: modulus (one conditional subtraction of n forms the reference)
+		input	[255:0]	x;
+		input	[255:0]	y;
+		input	[255:0]	n;
+				
+		begin
+		
+				/* assert the shared write enable of the X, Y and N buffers */
+			tb_xyn_wren	= 1;
+			
+				/* initialize shift registers */
+			x_shreg = x;
+			y_shreg = y;
+			n_shreg = n;
+			
+				/* write all the words, least significant word at address 0 */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set addresses */
+				tb_xyn_addr	= w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* set data words */
+				tb_x_data	= x_shreg[31:0];
+				tb_y_data	= y_shreg[31:0];
+				tb_n_data	= n_shreg[31:0];
+				
+					/* shift right by one word, vacated upper bits are filled with X */
+				x_shreg = {{32{1'bX}}, x_shreg[255:32]};
+				y_shreg = {{32{1'bX}}, y_shreg[255:32]};
+				n_shreg = {{32{1'bX}}, n_shreg[255:32]};
+				
+					/* wait for 1 clock tick (10 ns) */
+				#10;
+				
+			end
+			
+				/* wipe addresses (drive X now that writing is done) */
+			tb_xyn_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+			
+				/* wipe data words */
+			tb_x_data	= {32{1'bX}};
+			tb_y_data	= {32{1'bX}};
+			tb_n_data	= {32{1'bX}};
+			
+				/* stop filling memories */
+			tb_xyn_wren	= 0;
+						
+				/* calculate reference value: 257-bit sum, reduced by n once if it is not below n */
+			s = {1'b0, x} + {1'b0, y};
+			if (s >= {1'b0, n})
+				s = s - {1'b0, n};
+			
+				/* start operation */
+			ena = 1;
+			
+				/* keep ena high for 10 ns, then clear it */
+			#10 ena = 0;
+			
+				/* wait for operation to complete, sampling rdy every 10 ns */
+			while (!rdy) #10;
+			
+				/* read result back, one word per clock tick */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set address */
+				tb_s_addr	= w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+					/* store data word: it enters at the top, so after all reads word 0 sits in bits [31:0] */
+				s_shreg = {tb_s_data, s_shreg[255:32]};
+
+			end				
+			
+				/* compare with the lower 256 bits of the reference value */
+			s_ok = (s_shreg == s[255:0]);
+
+				/* display results */
+			$display("test_modular_adder(): %s", s_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && s_ok;
+		
+		end
+		
+	endtask
+	
+      
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/bench/tb_modular_invertor.v b/bench/tb_modular_invertor.v
new file mode 100644
index 0000000..0ef7c88
--- /dev/null
+++ b/bench/tb_modular_invertor.v
@@ -0,0 +1,226 @@
+`timescale 1ns / 1ps
+
+module tb_modular_invertor;
+
+
+		//
+		// Test vectors: modulus Q and two (operand, expected result) pairs
+		//
+	localparam [255:0] Q    = 256'hffffffff00000001000000000000000000000000ffffffffffffffffffffffff;
+
+	localparam [255:0] A_1  = 256'hd3e73ccd63a5b10da308c615bb9ebd3f76e2c5fccc256fd9f629dcc956bf2382;
+	localparam [255:0] A1_1 = 256'h93fb26d5d199bbb7232a4b7c98e97ba9bb7530d304b5f07736ea4027bbb57ecd;
+
+	localparam [255:0] A_2  = 256'h57b6c628a5c4e870740b2517975ace2216acbe094ac54568b53212ef45e69d22;
+	localparam [255:0] A1_2 = 256'hcd2af4766642d7d2f3f3f67d92c575c496772ef7d55c75eb46bd07e8d5f9a4aa;
+
+
+		//
+		// Free-running clock: toggles every 5 ns, i.e. one full period per 10 ns
+		//
+	reg clk = 1'b0;
+	always begin #5 clk = ~clk; end
+
+
+		//
+		// Control inputs (driven by this testbench) and ready flag (driven by the core)
+		//
+	reg  rst_n;
+	reg  ena;
+	wire rdy;
+	
+	
+		//
+		// Word memories: A and Q are written by the testbench, A1 is written by the core
+		//
+	wire	[2:0]	core_a_addr;
+	wire	[2:0]	core_q_addr;
+	wire	[2:0]	core_a1_addr;
+	wire			core_a1_wren;
+
+	wire	[31:0]	core_a_data;
+	wire	[31:0]	core_q_data;
+	wire	[31:0]	core_a1_data;
+
+	reg	[2:0]	tb_aq_addr;		// shared write address for A and Q
+	reg			tb_aq_wren;		// shared write enable for A and Q
+	reg	[2:0]	tb_a1_addr;		// read address for A1
+
+	reg	[31:0]	tb_a_data;
+	reg	[31:0]	tb_q_data;
+	wire	[31:0]	tb_a1_data;
+		// operand A: a-side written by the testbench, b-side read by the core
+	bram_1rw_1ro_readfirst #(.MEM_ADDR_BITS(3), .MEM_WIDTH(32))
+	bram_a
+	(	.clk(clk),
+		.a_addr(tb_aq_addr), .a_wr(tb_aq_wren), .a_in(tb_a_data), .a_out(),
+		.b_addr(core_a_addr), .b_out(core_a_data)
+	);
+		// modulus Q: same address and write enable as A, separate data word
+	bram_1rw_1ro_readfirst #(.MEM_ADDR_BITS(3), .MEM_WIDTH(32))
+	bram_q
+	(	.clk(clk),
+		.a_addr(tb_aq_addr), .a_wr(tb_aq_wren), .a_in(tb_q_data), .a_out(),
+		.b_addr(core_q_addr), .b_out(core_q_data)
+	);
+		// result A1: a-side written by the core, b-side read back by the testbench
+	bram_1rw_1ro_readfirst #(.MEM_ADDR_BITS(3), .MEM_WIDTH(32))
+	bram_a1
+	(	.clk(clk),
+		.a_addr(core_a1_addr), .a_wr(core_a1_wren), .a_in(core_a1_data), .a_out(),
+		.b_addr(tb_a1_addr), .b_out(tb_a1_data)
+	);
+	
+	
+		//
+		// Unit under test: modular invertor wired to the three memories above
+		//
+	modular_invertor #
+	(
+		.MAX_OPERAND_WIDTH(256)
+	)
+	uut
+	(
+		.clk     (clk),
+		.rst_n   (rst_n),
+			// start request from the testbench, ready flag back to it
+		.ena     (ena),
+		.rdy     (rdy),
+			// addresses and write enable generated by the core
+		.a_addr  (core_a_addr),
+		.q_addr  (core_q_addr),
+		.a1_addr (core_a1_addr),
+		.a1_wren (core_a1_wren),
+			// data words read from A and Q, data word written to A1
+		.a_din   (core_a_data),
+		.q_din   (core_q_data),
+		.a1_dout (core_a1_data)
+	);
+
+		//
+		// Testbench Routine: reset the core, run both vectors, print the verdict
+		//
+	reg ok = 1;
+	initial begin
+		
+				/* initialize control inputs */
+		rst_n		= 0;
+		ena		= 0;
+		tb_aq_wren	= 0;	/* keep the A/Q write enable defined (not X) before the first test */
+			/* hold reset for 200 ns */
+		#200;
+		
+			/* de-assert reset */
+		rst_n		= 1;
+		
+			/* wait another 100 ns before the first operation */
+		#100;		
+		
+			/* run tests: each call clears 'ok' if the core's result mismatches */
+		test_modular_invertor(A_1, A1_1, Q);
+		test_modular_invertor(A_2, A1_2, Q);
+		
+			/* print result */
+		if (ok)	$display("tb_modular_invertor: SUCCESS");
+		else		$display("tb_modular_invertor: FAILURE");
+		// $finish is left disabled as in the original, so the free-running
+		// clock keeps the simulation alive after the verdict is printed.
+		//$finish;
+
+	end
+	
+	
+      //
+		// Test Task: load A and Q, run one inversion, compare the result with A1
+		//	
+	reg		a1_ok;		// outcome of the most recent call (1 = exact match)
+	
+	integer	w;			// word index shared by the write loop and the read loop
+
+	task test_modular_invertor;
+		// a: operand written to bram_a, q: modulus written to bram_q
+		input	[255:0]	a;
+		input	[255:0]	a1;	// value expected in bram_a1 once rdy is seen
+		input	[255:0]	q;
+				
+		reg	[255:0]	a_shreg;
+		reg	[255:0]	a1_shreg;
+		reg	[255:0]	q_shreg;
+		
+		begin
+		
+				/* start filling memories */
+			tb_aq_wren = 1;
+			
+				/* initialize shift registers */
+			a_shreg = a;
+			q_shreg = q;
+			
+				/* write all the words, least significant word first */
+			for (w=0; w<8; w=w+1) begin
+				
+					/* set addresses */
+				tb_aq_addr = w[2:0];
+				
+					/* set data words */
+				tb_a_data	= a_shreg[31:0];
+				tb_q_data	= q_shreg[31:0];
+				
+					/* shift inputs (X is shifted in so stale words are easy to spot) */
+				a_shreg = {{32{1'bX}}, a_shreg[255:32]};
+				q_shreg = {{32{1'bX}}, q_shreg[255:32]};
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+			end
+			
+				/* wipe addresses */
+			tb_aq_addr = {3{1'bX}};
+			
+				/* wipe data words */
+			tb_a_data = {32{1'bX}};
+			tb_q_data = {32{1'bX}};
+			
+				/* stop filling memories */
+			tb_aq_wren = 0;
+			
+				/* start operation */
+			ena = 1;
+			
+				/* clear flag after one clock period */
+			#10 ena = 0;
+			
+				/* wait for operation to complete */
+			while (!rdy) #10;
+
+				/* read result, least significant word first */
+			for (w=0; w<8; w=w+1) begin
+				
+					/* set address */
+				tb_a1_addr = w[2:0];
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+					/* store data word (after 8 shifts word 0 sits in bits 31:0) */
+				a1_shreg = {tb_a1_data, a1_shreg[255:32]};
+
+			end
+			
+				/* compare with case equality: any X/Z read back is a clean mismatch */
+			a1_ok =	(a1_shreg === a1);
+
+				/* display results */
+			$display("test_modular_invertor(): %s", a1_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && a1_ok;
+		
+		end
+		
+	endtask
+	
+	
+endmodule
+
diff --git a/bench/tb_modular_multiplier_256.v b/bench/tb_modular_multiplier_256.v
new file mode 100644
index 0000000..3f62767
--- /dev/null
+++ b/bench/tb_modular_multiplier_256.v
@@ -0,0 +1,366 @@
+//------------------------------------------------------------------------------
+//
+// tb_modular_multiplier_256.v
+// -----------------------------------------------------------------------------
+// Testbench for modular multi-word multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+module tb_modular_multiplier_256;
+
+
+		//
+		// Test vectors: modulus N and four (X, Y, expected product P) triples
+		//
+	localparam [255:0] N   = 256'hffffffff00000001000000000000000000000000ffffffffffffffffffffffff;
+
+	localparam [255:0] X_1 = 256'h6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296;
+	localparam [255:0] Y_1 = 256'h4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5;
+	localparam [255:0] P_1 = 256'h823cd15f6dd3c71933565064513a6b2bd183e554c6a08622f713ebbbface98be;
+
+	localparam [255:0] X_2 = 256'h29d05c193da77b710e86323538b77e1b11f904fea42998be16bd8d744ece7ad0;
+	localparam [255:0] Y_2 = 256'hb01cbd1c01e58065711814b583f061e9d431cca994cea1313449bf97c840ae07;
+	localparam [255:0] P_2 = 256'h76b2571d1d009ab0e7d1cc086c7d3648f08755b2e2585e780d11f053b06fb6ec;
+
+	localparam [255:0] X_3 = 256'h8101ece47464a6ead70cf69a6e2bd3d88691a3262d22cba4f7635eaff26680a8;
+	localparam [255:0] Y_3 = 256'hd8a12ba61d599235f67d9cb4d58f1783d3ca43e78f0a5abaa624079936c0c3a9;
+	localparam [255:0] P_3 = 256'h944fea6a4fac7ae475a6bb211db4bbd394bd9b3ee9a038f6c17125a00b3a5375;
+
+	localparam [255:0] X_4 = 256'h7214bc9647160bbd39ff2f80533f5dc6ddd70ddf86bb815661e805d5d4e6f27c;
+	localparam [255:0] Y_4 = 256'h8b81e3e977597110c7cf2633435b2294b72642987defd3d4007e1cfc5df84541;
+	localparam [255:0] P_4 = 256'h78d3e33c81ab9c652679363c76df004ea6f9a9e3a242a0fb71a4e8fdf41ab519;
+
+
+		//
+		// Buffer geometry: 8 words of 32 bits, addressed with 3 bits
+		//
+	localparam WORD_COUNTER_WIDTH = 3;
+	localparam OPERAND_NUM_WORDS  = 8;
+
+
+		//
+		// Free-running clock: toggles every 5 ns (10 ns period, 100 MHz)
+		//
+	reg clk = 1'b0;
+	always begin #5 clk = ~clk; end
+
+
+		//
+		// Control inputs (driven by this testbench) and ready flag (driven by the core)
+		//
+	reg  rst_n;
+	reg  ena;
+	wire rdy;
+	
+	
+		//
+		// Word memories: X, Y, N are written by the testbench, P is written by the core
+		//
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_x_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_y_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_n_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_p_addr;
+
+	wire								core_p_wren;
+
+	wire	[31:0]	core_x_data;
+	wire	[31:0]	core_y_data;
+	wire	[31:0]	core_n_data;
+	wire	[31:0]	core_p_data;
+
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_xyn_addr;	// shared write address for X, Y, N
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_p_addr;		// read address for P
+
+	reg								tb_xyn_wren;	// shared write enable for X, Y, N
+
+	reg	[31:0]	tb_x_data;
+	reg	[31:0]	tb_y_data;
+	reg	[31:0]	tb_n_data;
+	wire	[31:0]	tb_p_data;
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_ADDR_BITS	(WORD_COUNTER_WIDTH),
+		.MEM_WIDTH		(32)
+	)
+	bram_x
+	(
+		.clk	(clk),
+			// a-side: written by the testbench
+		.a_addr	(tb_xyn_addr),
+		.a_wr	(tb_xyn_wren),
+		.a_in	(tb_x_data),
+		.a_out	(),
+			// b-side: read by the core
+		.b_addr	(core_x_addr),
+		.b_out	(core_x_data)
+	);
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_ADDR_BITS	(WORD_COUNTER_WIDTH),
+		.MEM_WIDTH		(32)
+	)
+	bram_y
+	(
+		.clk	(clk),
+			// a-side: written by the testbench
+		.a_addr	(tb_xyn_addr),
+		.a_wr	(tb_xyn_wren),
+		.a_in	(tb_y_data),
+		.a_out	(),
+			// b-side: read by the core
+		.b_addr	(core_y_addr),
+		.b_out	(core_y_data)
+	);
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_ADDR_BITS	(WORD_COUNTER_WIDTH),
+		.MEM_WIDTH		(32)
+	)
+	bram_n
+	(
+		.clk	(clk),
+			// a-side: written by the testbench
+		.a_addr	(tb_xyn_addr),
+		.a_wr	(tb_xyn_wren),
+		.a_in	(tb_n_data),
+		.a_out	(),
+			// b-side: read by the core
+		.b_addr	(core_n_addr),
+		.b_out	(core_n_data)
+	);
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_ADDR_BITS	(WORD_COUNTER_WIDTH),
+		.MEM_WIDTH		(32)
+	)
+	bram_s
+	(
+		.clk	(clk),
+			// a-side: written by the core (instance is named bram_s but holds the product P)
+		.a_addr	(core_p_addr),
+		.a_wr	(core_p_wren),
+		.a_in	(core_p_data),
+		.a_out	(),
+			// b-side: read back by the testbench
+		.b_addr	(tb_p_addr),
+		.b_out	(tb_p_data)
+	);
+	
+	
+		//
+		// Unit under test: the core's A/B operands are fed from the X/Y memories
+		//
+	modular_multiplier_256 uut
+	(
+		.clk    (clk),
+		.rst_n  (rst_n),
+			// start request from the testbench, ready flag back to it
+		.ena    (ena),
+		.rdy    (rdy),
+			// addresses and write enable generated by the core
+		.a_addr (core_x_addr),
+		.b_addr (core_y_addr),
+		.n_addr (core_n_addr),
+		.p_addr (core_p_addr),
+		.p_wren (core_p_wren),
+			// data words read from X, Y, N and the data word written to P
+		.a_din  (core_x_data),
+		.b_din  (core_y_data),
+		.n_din  (core_n_data),
+		.p_dout (core_p_data)
+	);
+
+		
+		//
+		// Main stimulus: reset the core, run every test vector, print the verdict
+		//
+	reg ok = 1;
+	initial begin
+
+			/* idle defaults: no start request, no memory writes, reset asserted */
+		ena         = 0;
+		tb_xyn_wren = 0;
+		rst_n       = 0;
+
+			/* keep reset asserted for 200 ns, then release it */
+		#200;
+		rst_n = 1;
+
+			/* leave another 100 ns before the first operation */
+		#100;
+
+			/* one call per hard-coded vector: each call loads the operands, */
+			/* runs the core and clears 'ok' when the product mismatches */
+		test_modular_multiplier(X_1, Y_1, N, P_1);
+		test_modular_multiplier(X_2, Y_2, N, P_2);
+		test_modular_multiplier(X_3, Y_3, N, P_3);
+		test_modular_multiplier(X_4, Y_4, N, P_4);
+
+			/* final verdict */
+		if (ok)
+			$display("tb_modular_multiplier_256: SUCCESS");
+		else
+			$display("tb_modular_multiplier_256: FAILURE");
+
+			/* $finish stays disabled, so the simulation does not stop by itself */
+		//$finish;
+	end
+	
+	
+		//
+		// Test Task: load X, Y, N, run one multiplication, compare the result with PP
+		//
+	reg	[255:0]	p;
+	reg				p_ok;		// outcome of the most recent call (1 = exact match)
+	
+	integer			w;			// word index shared by the write loop and the read loop
+	
+	reg	[511:0]	pp_full;	// full 512-bit product used for the reference value
+	reg	[255:0]	pp_ref;		// pp_full reduced modulo n
+	
+	task test_modular_multiplier;
+		// x, y: operands; n: modulus; pp: hard-coded expected product
+		input	[255:0] x;
+		input	[255:0] y;
+		input	[255:0] n;
+		input	[255:0] pp;
+		
+		reg	[255:0]	x_shreg;
+		reg	[255:0]	y_shreg;
+		reg	[255:0]	n_shreg;
+		reg	[255:0]	p_shreg;
+	
+		begin
+		
+				/* start filling memories */
+			tb_xyn_wren	= 1;
+			
+				/* initialize shift registers */
+			x_shreg = x;
+			y_shreg = y;
+			n_shreg = n;
+			
+				/* write all the words, least significant word first */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set addresses */
+				tb_xyn_addr	= w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* set data words */
+				tb_x_data	= x_shreg[31:0];
+				tb_y_data	= y_shreg[31:0];
+				tb_n_data	= n_shreg[31:0];
+				
+					/* shift inputs (X is shifted in so stale words are easy to spot) */
+				x_shreg = {{32{1'bX}}, x_shreg[255:32]};
+				y_shreg = {{32{1'bX}}, y_shreg[255:32]};
+				n_shreg = {{32{1'bX}}, n_shreg[255:32]};
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+			end
+			
+				/* wipe addresses */
+			tb_xyn_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+			
+				/* wipe data words */
+			tb_x_data	= {32{1'bX}};
+			tb_y_data	= {32{1'bX}};
+			tb_n_data	= {32{1'bX}};
+			
+				/* stop filling memories */
+			tb_xyn_wren	= 0;
+			
+				/* calculate reference value: (x * y) mod n in 512-bit arithmetic */
+			pp_full = {{256{1'b0}}, x} * {{256{1'b0}}, y};
+			pp_ref = pp_full % {{256{1'b0}}, n};
+			
+				/* compare reference value against hard-coded one (an X reference also fails) */
+			if (pp_ref !== pp) begin
+				$display("ERROR: pp_ref != pp");
+				$finish;
+			end
+			
+				/* start operation */
+			ena = 1;
+			
+				/* clear flag after one clock period */
+			#10 ena = 0;
+			
+				/* wait for operation to complete */
+			while (!rdy) #10;
+			
+				/* read result, least significant word first */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set address */
+				tb_p_addr	= w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+					/* store data word (after 8 shifts word 0 sits in bits 31:0) */
+				p_shreg = {tb_p_data, p_shreg[255:32]};
+
+			end				
+			
+				/* compare with case equality: any X/Z read back is a clean mismatch */
+			p_ok = (p_shreg === pp);
+
+				/* display results */
+			$display("test_modular_multiplier(): %s", p_ok ? "OK" : "ERROR");
+			
+				/* update flag */
+			ok = ok && p_ok;
+		
+		end
+		
+	endtask
+	
+
+	
+      
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/bench/tb_modular_subtractor.v b/bench/tb_modular_subtractor.v
new file mode 100644
index 0000000..f45a286
--- /dev/null
+++ b/bench/tb_modular_subtractor.v
@@ -0,0 +1,356 @@
+//------------------------------------------------------------------------------
+//
+// tb_modular_subtractor.v
+// -----------------------------------------------------------------------------
+// Testbench for modular multi-word subtractor.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+module tb_modular_subtractor_256;
+
+
+		//
+		// Test vectors: modulus N and two (X, Y) operand pairs
+		//
+	localparam [255:0] N   = 256'hffffffff00000001000000000000000000000000ffffffffffffffffffffffff;
+
+	localparam [255:0] X_1 = 256'h1ddbd0769df27bab1e234019dad09dccce1e87e2193b417ffa1a3465d7439ecd;
+	localparam [255:0] Y_1 = 256'h1f67cdc34bac91a072945d212f0a03442fc4855788583ecb7b2e375ad3848210;
+
+	localparam [255:0] X_2 = 256'hff563f653b1392a6fa6b0295a280f7a904a11e22d8ae468e220301d8ac232fcf;
+	localparam [255:0] Y_2 = 256'hf6f53c4b57b25453b68e923fb118e4f753d74af01fc58476dd15a80933453899;
+
+
+		//
+		// Buffer geometry: 8 words of 32 bits, addressed with 3 bits
+		//
+	localparam WORD_COUNTER_WIDTH = 3;
+	localparam OPERAND_NUM_WORDS  = 8;
+
+
+		//
+		// Free-running clock: toggles every 5 ns (10 ns period, 100 MHz)
+		//
+	reg clk = 1'b0;
+	always begin #5 clk = ~clk; end
+
+
+		//
+		// Control inputs (driven by this testbench) and ready flag (driven by the core)
+		//
+	reg  rst_n;
+	reg  ena;
+	wire rdy;
+	
+	
+		//
+		// Word memories: X, Y, N are written by the testbench, D is written by the core
+		//
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_xy_addr;	// one read address for both X and Y
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_n_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_d_addr;
+	wire								core_d_wren;
+
+	wire	[31:0]	core_x_data;
+	wire	[31:0]	core_y_data;
+	wire	[31:0]	core_n_data;
+	wire	[31:0]	core_d_data;
+
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_xyn_addr;	// shared write address for X, Y, N
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_d_addr;		// read address for D
+	reg								tb_xyn_wren;	// shared write enable for X, Y, N
+
+	reg	[31:0]	tb_x_data;
+	reg	[31:0]	tb_y_data;
+	reg	[31:0]	tb_n_data;
+	wire	[31:0]	tb_d_data;
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_ADDR_BITS	(WORD_COUNTER_WIDTH),
+		.MEM_WIDTH		(32)
+	)
+	bram_x
+	(
+		.clk	(clk),
+			// a-side: written by the testbench
+		.a_addr	(tb_xyn_addr),
+		.a_wr	(tb_xyn_wren),
+		.a_in	(tb_x_data),
+		.a_out	(),
+			// b-side: read by the core
+		.b_addr	(core_xy_addr),
+		.b_out	(core_x_data)
+	);
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_ADDR_BITS	(WORD_COUNTER_WIDTH),
+		.MEM_WIDTH		(32)
+	)
+	bram_y
+	(
+		.clk	(clk),
+			// a-side: written by the testbench
+		.a_addr	(tb_xyn_addr),
+		.a_wr	(tb_xyn_wren),
+		.a_in	(tb_y_data),
+		.a_out	(),
+			// b-side: read by the core
+		.b_addr	(core_xy_addr),
+		.b_out	(core_y_data)
+	);
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_ADDR_BITS	(WORD_COUNTER_WIDTH),
+		.MEM_WIDTH		(32)
+	)
+	bram_n
+	(
+		.clk	(clk),
+			// a-side: written by the testbench
+		.a_addr	(tb_xyn_addr),
+		.a_wr	(tb_xyn_wren),
+		.a_in	(tb_n_data),
+		.a_out	(),
+			// b-side: read by the core
+		.b_addr	(core_n_addr),
+		.b_out	(core_n_data)
+	);
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_ADDR_BITS	(WORD_COUNTER_WIDTH),
+		.MEM_WIDTH		(32)
+	)
+	bram_s
+	(
+		.clk	(clk),
+			// a-side: written by the core (instance is named bram_s but holds the difference D)
+		.a_addr	(core_d_addr),
+		.a_wr	(core_d_wren),
+		.a_in	(core_d_data),
+		.a_out	(),
+			// b-side: read back by the testbench
+		.b_addr	(tb_d_addr),
+		.b_out	(tb_d_data)
+	);
+	
+	
+		//
+		// Unit under test: the core's A/B operands are fed from the X/Y memories
+		//
+	modular_subtractor #
+	(
+		.OPERAND_NUM_WORDS  (OPERAND_NUM_WORDS),
+		.WORD_COUNTER_WIDTH (WORD_COUNTER_WIDTH)
+	)
+	uut
+	(
+		.clk     (clk),
+		.rst_n   (rst_n),
+			// start request from the testbench, ready flag back to it
+		.ena     (ena),
+		.rdy     (rdy),
+			// addresses and write enable generated by the core
+		.ab_addr (core_xy_addr),
+		.n_addr  (core_n_addr),
+		.d_addr  (core_d_addr),
+		.d_wren  (core_d_wren),
+			// data words read from X, Y, N and the data word written to D
+		.a_din   (core_x_data),
+		.b_din   (core_y_data),
+		.n_din   (core_n_data),
+		.d_dout  (core_d_data)
+	);
+		
+		
+		//
+		// Main stimulus: reset the core, run every operand combination, print the verdict
+		//
+	reg ok = 1;
+	initial begin
+
+			/* idle defaults: no start request, no memory writes, reset asserted */
+		ena         = 0;
+		tb_xyn_wren = 0;
+		rst_n       = 0;
+
+			/* keep reset asserted for 200 ns, then release it */
+		#200;
+		rst_n = 1;
+
+			/* leave another 100 ns before the first operation */
+		#100;
+
+			/* each (X_i, Y_i) pair, then the same pair with the operands swapped */
+		test_modular_subtractor(X_1, Y_1, N);
+		test_modular_subtractor(X_2, Y_2, N);
+		test_modular_subtractor(Y_1, X_1, N);
+		test_modular_subtractor(Y_2, X_2, N);
+
+			/* X against X and Y against Y, in both orders */
+		test_modular_subtractor(X_1, X_2, N);
+		test_modular_subtractor(X_2, X_1, N);
+		test_modular_subtractor(Y_1, Y_2, N);
+		test_modular_subtractor(Y_2, Y_1, N);
+
+			/* cross pairs (X_1, Y_2) and (X_2, Y_1), in both orders */
+		test_modular_subtractor(X_1, Y_2, N);
+		test_modular_subtractor(Y_2, X_1, N);
+		test_modular_subtractor(X_2, Y_1, N);
+		test_modular_subtractor(Y_1, X_2, N);
+
+			/* final verdict: 'ok' is cleared by any call whose result mismatched */
+		if (ok)
+			$display("tb_modular_subtractor_256: SUCCESS");
+		else
+			$display("tb_modular_subtractor_256: FAILURE");
+
+		$finish;
+	end
+	
+	
+		//
+		// Test Task: load X, Y, N, run one subtraction, compare D with (x - y) mod n
+		//	
+	reg	[256:0]	d;				// reference value, one bit wider than the operands
+	wire	[255:0]	d_dummy = d[255:0];
+	reg				d_ok;			// outcome of the most recent call (1 = exact match)
+
+	integer			w;				// word index shared by the write and read loops
+	
+	reg	[255:0]	x_shreg;
+	reg	[255:0]	y_shreg;
+	reg	[255:0]	n_shreg;
+	reg	[255:0]	d_shreg;
+	
+	task test_modular_subtractor;
+		// x: minuend, y: subtrahend, n: modulus
+		input	[255:0]	x;
+		input	[255:0]	y;
+		input	[255:0]	n;
+				
+		begin
+		
+				/* start filling memories */
+			tb_xyn_wren	= 1;
+			
+				/* initialize shift registers */
+			x_shreg = x;
+			y_shreg = y;
+			n_shreg = n;
+			
+				/* write all the words, least significant word first */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set addresses */
+				tb_xyn_addr	= w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* set data words */
+				tb_x_data	= x_shreg[31:0];
+				tb_y_data	= y_shreg[31:0];
+				tb_n_data	= n_shreg[31:0];
+				
+					/* shift inputs (X is shifted in so stale words are easy to spot) */
+				x_shreg = {{32{1'bX}}, x_shreg[255:32]};
+				y_shreg = {{32{1'bX}}, y_shreg[255:32]};
+				n_shreg = {{32{1'bX}}, n_shreg[255:32]};
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+			end
+			
+				/* wipe addresses */
+			tb_xyn_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+			
+				/* wipe data words */
+			tb_x_data	= {32{1'bX}};
+			tb_y_data	= {32{1'bX}};
+			tb_n_data	= {32{1'bX}};
+			
+				/* stop filling memories */
+			tb_xyn_wren	= 0;
+						
+				/* calculate reference value: add n first when x < y so the result is non-negative */
+			d = {1'b0, (x < y) ? n : {256{1'b0}}};
+			d = d + {1'b0, x} - {1'b0, y};
+			
+				/* start operation */
+			ena = 1;
+			
+				/* clear flag after one clock period */
+			#10 ena = 0;
+			
+				/* wait for operation to complete */
+			while (!rdy) #10;
+			
+				/* read result, least significant word first */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set address */
+				tb_d_addr	= w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+					/* store data word (after 8 shifts word 0 sits in bits 31:0) */
+				d_shreg = {tb_d_data, d_shreg[255:32]};
+
+			end				
+			
+				/* compare with case equality: any X/Z read back is a clean mismatch */
+			d_ok = (d_shreg === d[255:0]);
+
+				/* display results */
+			$display("test_modular_subtractor(): %s", d_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && d_ok;
+		
+		end
+		
+	endtask
+	
+      
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/bench/tb_mw_comparator.v b/bench/tb_mw_comparator.v
new file mode 100644
index 0000000..abab8bc
--- /dev/null
+++ b/bench/tb_mw_comparator.v
@@ -0,0 +1,322 @@
+//------------------------------------------------------------------------------
+//
+// tb_mw_comparator.v
+// -----------------------------------------------------------------------------
+// Testbench for multi-word comparator.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+module tb_mw_comparator;
+
+
+		//
+		// Test Vectors: 256-bit values (8 x 32-bit words), matching the task's [255:0] inputs
+		//	
+	localparam	[255:0]	A_0	= 256'hBDC7B53C_616B13B5_77622510_75BA95FC_475D568B_79E730D4_18A9143C_18905F76;
+		// A_0 with word 0 (least significant) made smaller / larger
+	localparam	[255:0]	A_1	= 256'hBDC7B53C_616B13B5_77622510_75BA95FC_475D568B_79E730D4_18A9143C_18905F75;
+	localparam	[255:0]	A_2	= 256'hBDC7B53C_616B13B5_77622510_75BA95FC_475D568B_79E730D4_18A9143C_18905F77;
+		// A_0 with word 3 made smaller / larger
+	localparam	[255:0]	A_3	= 256'hBDC7B53C_616B13B5_77622510_75BA95FC_375D568B_79E730D4_18A9143C_18905F76;
+	localparam	[255:0]	A_4	= 256'hBDC7B53C_616B13B5_77622510_75BA95FC_575D568B_79E730D4_18A9143C_18905F76;
+		// A_0 with word 4 made smaller / larger
+	localparam	[255:0]	A_5	= 256'hBDC7B53C_616B13B5_77622510_75BA95FB_475D568B_79E730D4_18A9143C_18905F76;
+	localparam	[255:0]	A_6	= 256'hBDC7B53C_616B13B5_77622510_75BA95FD_475D568B_79E730D4_18A9143C_18905F76;
+		// A_0 with word 7 (most significant) made smaller / larger
+	localparam	[255:0]	A_7	= 256'hADC7B53C_616B13B5_77622510_75BA95FC_475D568B_79E730D4_18A9143C_18905F76;
+	localparam	[255:0]	A_8	= 256'hCDC7B53C_616B13B5_77622510_75BA95FC_475D568B_79E730D4_18A9143C_18905F76;
+		
+	localparam	[255:0]	B_0	= 256'h348A6D1F_7C66D21E_8D1490D9_AA6AE3C0_AD784F98_850046D4_10DDD64D_F6BB32E5;
+		// B_0 with word 0 (least significant) made smaller / larger
+	localparam	[255:0]	B_1	= 256'h348A6D1F_7C66D21E_8D1490D9_AA6AE3C0_AD784F98_850046D4_10DDD64D_F6BB32E4;
+	localparam	[255:0]	B_2	= 256'h348A6D1F_7C66D21E_8D1490D9_AA6AE3C0_AD784F98_850046D4_10DDD64D_F6BB32E6;
+		// B_0 with word 3 made smaller / larger
+	localparam	[255:0]	B_3	= 256'h348A6D1F_7C66D21E_8D1490D9_AA6AE3C0_9D784F98_850046D4_10DDD64D_F6BB32E5;
+	localparam	[255:0]	B_4	= 256'h348A6D1F_7C66D21E_8D1490D9_AA6AE3C0_BD784F98_850046D4_10DDD64D_F6BB32E5;
+		// B_0 with word 4 made smaller / larger
+	localparam	[255:0]	B_5	= 256'h348A6D1F_7C66D21E_8D1490D9_AA6AE3BF_AD784F98_850046D4_10DDD64D_F6BB32E5;
+	localparam	[255:0]	B_6	= 256'h348A6D1F_7C66D21E_8D1490D9_AA6AE3C1_AD784F98_850046D4_10DDD64D_F6BB32E5;
+		// B_0 with word 7 (most significant) made smaller / larger
+	localparam	[255:0]	B_7	= 256'h248A6D1F_7C66D21E_8D1490D9_AA6AE3C0_AD784F98_850046D4_10DDD64D_F6BB32E5;
+	localparam	[255:0]	B_8	= 256'h448A6D1F_7C66D21E_8D1490D9_AA6AE3C0_AD784F98_850046D4_10DDD64D_F6BB32E5;
+	
+	
+		//
+		// Core Parameters (the test task below is written for 8 words of 32 bits)
+		//
+	localparam	WORD_COUNTER_WIDTH	=  3;
+	parameter	OPERAND_NUM_WORDS		=  8;
+	
+
+		//
+		// Clock (100 MHz): toggles every 5 ns
+		//
+	reg clk = 1'b0;
+	always #5 clk = ~clk;
+
+	
+		//
+		// Inputs, Outputs: control driven by the testbench, rdy and flags driven by the core
+		//
+	reg	rst_n;
+	reg	ena;
+	wire	rdy;
+	
+	wire	core_cmp_l;
+	wire	core_cmp_e;
+	wire	core_cmp_g;
+
+
+		//
+		// Word memories X and Y: written by the testbench, read by the core
+		//
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_xy_addr;	// one read address for both memories
+
+	wire	[31:0]	core_x_data;
+	wire	[31:0]	core_y_data;
+
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_xy_addr;		// shared write address
+	reg								tb_xy_wren;		// shared write enable
+
+	reg	[31:0]	tb_x_data;
+	reg	[31:0]	tb_y_data;
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_ADDR_BITS	(WORD_COUNTER_WIDTH),
+		.MEM_WIDTH		(32)
+	)
+	bram_x
+	(
+		.clk	(clk),
+			// a-side: written by the testbench
+		.a_addr	(tb_xy_addr),
+		.a_wr	(tb_xy_wren),
+		.a_in	(tb_x_data),
+		.a_out	(),
+			// b-side: read by the core
+		.b_addr	(core_xy_addr),
+		.b_out	(core_x_data)
+	);
+
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_ADDR_BITS	(WORD_COUNTER_WIDTH),
+		.MEM_WIDTH		(32)
+	)
+	bram_y
+	(
+		.clk	(clk),
+			// a-side: written by the testbench
+		.a_addr	(tb_xy_addr),
+		.a_wr	(tb_xy_wren),
+		.a_in	(tb_y_data),
+		.a_out	(),
+			// b-side: read by the core
+		.b_addr	(core_xy_addr),
+		.b_out	(core_y_data)
+	);
+	
+	
+		//
+		// Unit under test: multi-word comparator reading the X and Y memories
+		//
+	mw_comparator #
+	(
+		.OPERAND_NUM_WORDS  (OPERAND_NUM_WORDS),
+		.WORD_COUNTER_WIDTH (WORD_COUNTER_WIDTH)
+	)
+	uut
+	(
+		.clk     (clk),
+		.rst_n   (rst_n),
+			// start request from the testbench, ready flag back to it
+		.ena     (ena),
+		.rdy     (rdy),
+			// shared read address and the two data words it selects
+		.xy_addr (core_xy_addr),
+		.x_din   (core_x_data),
+		.y_din   (core_y_data),
+			// result flags, checked by the test task against x<y, x==y, x>y
+		.cmp_l   (core_cmp_l),
+		.cmp_e   (core_cmp_e),
+		.cmp_g   (core_cmp_g)
+	);
+		
+		
+		//
+		// Main stimulus: reset the core, compare each base vector with its variants
+		//
+	reg ok = 1;
+	initial begin
+
+			/* idle defaults: no start request, no memory writes, reset asserted */
+		ena        = 0;
+		tb_xy_wren = 0;
+		rst_n      = 0;
+
+			/* keep reset asserted for 200 ns, then release it */
+		#200;
+		rst_n = 1;
+
+			/* leave another 100 ns before the first operation */
+		#100;
+
+			/* A_0 against itself, then against every A variant */
+		test_mw_comparator(A_0, A_0);
+
+		test_mw_comparator(A_0, A_1);
+		test_mw_comparator(A_0, A_2);
+		test_mw_comparator(A_0, A_3);
+		test_mw_comparator(A_0, A_4);
+		test_mw_comparator(A_0, A_5);
+		test_mw_comparator(A_0, A_6);
+		test_mw_comparator(A_0, A_7);
+		test_mw_comparator(A_0, A_8);
+
+			/* B_0 against itself, then against every B variant */
+		test_mw_comparator(B_0, B_0);
+
+		test_mw_comparator(B_0, B_1);
+		test_mw_comparator(B_0, B_2);
+		test_mw_comparator(B_0, B_3);
+		test_mw_comparator(B_0, B_4);
+		test_mw_comparator(B_0, B_5);
+		test_mw_comparator(B_0, B_6);
+		test_mw_comparator(B_0, B_7);
+		test_mw_comparator(B_0, B_8);
+
+			/* final verdict: 'ok' is cleared by any call whose flags mismatched */
+		if (ok)
+			$display("tb_mw_comparator: SUCCESS");
+		else
+			$display("tb_mw_comparator: FAILURE");
+
+			/* end the simulation */
+		$finish;
+	end
+	
+	
+		//
+		// Test Task: load X and Y, run one comparison, check the three result flags
+		//	
+	reg	cmp_l;		// reference: x <  y
+	reg	cmp_e;		// reference: x == y
+	reg	cmp_g;		// reference: x >  y
+	reg	cmp_ok;		// outcome of the most recent call (1 = all three flags match)
+
+	integer w;		// word index for the write loop
+
+	task test_mw_comparator;
+		// x, y: the two values to compare
+		input	[255:0]	x;
+		input	[255:0]	y;
+		
+		reg	[255:0]	x_shreg;
+		reg	[255:0]	y_shreg;
+		
+		begin
+		
+				/* start filling memories */
+			tb_xy_wren	= 1;
+			
+				/* initialize shift registers */
+			x_shreg = x;
+			y_shreg = y;
+			
+				/* write all the words, least significant word first */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set addresses */
+				tb_xy_addr	= w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* set data words */
+				tb_x_data	= x_shreg[31:0];
+				tb_y_data	= y_shreg[31:0];
+				
+					/* shift inputs (X is shifted in so stale words are easy to spot) */
+				x_shreg = {{32{1'bX}}, x_shreg[255:32]};
+				y_shreg = {{32{1'bX}}, y_shreg[255:32]};
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+			end
+			
+				/* wipe addresses */
+			tb_xy_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+			
+				/* wipe data words */
+			tb_x_data	= {32{1'bX}};
+			tb_y_data	= {32{1'bX}};
+			
+				/* stop filling memories */
+			tb_xy_wren	= 0;
+						
+				/* calculate reference values */
+			cmp_l = (x <  y) ? 1 : 0;
+			cmp_e = (x == y) ? 1 : 0;
+			cmp_g = (x >  y) ? 1 : 0;
+			
+				/* start operation */
+			ena = 1;
+			
+				/* clear flag after one clock period */
+			#10 ena = 0;
+			
+				/* wait for operation to complete */
+			while (!rdy) #10;
+			
+				/* compare with case equality: an X/Z flag from the core is a clean mismatch */
+			cmp_ok = (cmp_l === core_cmp_l) && (cmp_e === core_cmp_e) && (cmp_g === core_cmp_g);
+
+				/* display results */
+			$display("test_mw_comparator(): %s", cmp_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && cmp_ok;
+		
+		end
+		
+	endtask
+	
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/bench/tb_mw_mover.v b/bench/tb_mw_mover.v
new file mode 100644
index 0000000..be767fc
--- /dev/null
+++ b/bench/tb_mw_mover.v
@@ -0,0 +1,282 @@
+//------------------------------------------------------------------------------
+//
+// tb_mw_mover.v
+// -----------------------------------------------------------------------------
+// Testbench for multi-word data mover.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+`timescale 1ns / 1ps
+//------------------------------------------------------------------------------
+
+module tb_mw_mover;
+
+
+		//
+		// Test Vectors
+		//	
+	localparam	[255:0]	X_1	= 256'h1ddbd0769df27bab1e234019dad09dccce1e87e2193b417ffa1a3465d7439ecd;
+	localparam	[255:0]	X_2	= 256'h1f67cdc34bac91a072945d212f0a03442fc4855788583ecb7b2e375ad3848210;
+	localparam	[255:0]	X_3	= 256'hff563f653b1392a6fa6b0295a280f7a904a11e22d8ae468e220301d8ac232fcf;
+	localparam	[255:0]	X_4	= 256'hf6f53c4b57b25453b68e923fb118e4f753d74af01fc58476dd15a80933453899;
+	
+	
+		//
+		// Core Parameters
+		//
+	localparam	WORD_COUNTER_WIDTH	=  3;
+	localparam	OPERAND_NUM_WORDS		=  8;
+	
+
+		//
+		// Clock (100 MHz)
+		//
+	reg clk = 1'b0;
+	always #5 clk = ~clk;
+
+	
+		//
+		// Inputs, Outputs
+		//
+	reg				rst_n;
+	reg				ena;
+	wire				rdy;
+	
+	
+		//
+		// Buffers (X, Y)
+		//
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_x_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	core_y_addr;
+	wire										core_y_wren;
+	
+	wire	[                32-1:0]	core_x_data;
+	wire	[                32-1:0]	core_y_data;
+	
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_x_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	tb_y_addr;
+	reg										tb_x_wren;
+	
+	reg	[                32-1:0]	tb_x_data;
+	wire	[                32-1:0]	tb_y_data;
+	
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_WIDTH			(32),
+		.MEM_ADDR_BITS		(WORD_COUNTER_WIDTH)
+	)
+	bram_x
+	(
+		.clk		(clk),
+
+		.a_addr	(tb_x_addr),
+		.a_wr		(tb_x_wren),
+		.a_in		(tb_x_data),
+		.a_out	(),
+
+		.b_addr	(core_x_addr),
+		.b_out	(core_x_data)
+	);
+	
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_WIDTH			(32),
+		.MEM_ADDR_BITS		(WORD_COUNTER_WIDTH)
+	)
+	bram_d
+	(
+		.clk		(clk),
+
+		.a_addr	(core_y_addr),
+		.a_wr		(core_y_wren),
+		.a_in		(core_y_data),
+		.a_out	(),
+
+		.b_addr	(tb_y_addr),
+		.b_out	(tb_y_data)
+	);
+	
+	
+		//
+		// UUT
+		//
+	mw_mover #
+	(
+		.WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH),
+		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS)
+	)
+	uut
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(ena),
+		.rdy			(rdy),
+		
+		.x_addr		(core_x_addr),
+		.y_addr		(core_y_addr),
+		.y_wren		(core_y_wren),
+		
+		.x_din		(core_x_data),
+		.y_dout		(core_y_data)
+	);
+		
+		
+		// Testbench Routine: holds reset low for 200 ns, waits another
+		// 100 ns, runs the mover on the four test vectors and prints
+		// SUCCESS only when every single test passed.
+	reg ok = 1;		// global pass flag, cleared by any failing test
+	initial begin
+		
+			/* initialize control inputs */
+		rst_n			= 0;
+		ena			= 0;
+		
+		tb_x_wren	= 0;
+		
+			/* wait for some time */
+		#200;
+		
+			/* de-assert reset */
+		rst_n		= 1;
+		
+			/* wait for some time */
+		#100;		
+		
+			/* run tests */
+		test_modular_mover(X_1);
+		test_modular_mover(X_2);
+		test_modular_mover(X_3);
+		test_modular_mover(X_4);
+		
+			/* print result */
+		if (ok)	$display("tb_modular_mover: SUCCESS");
+		else		$display("tb_modular_mover: FAILURE");
+		// end the simulation once the verdict has been printed
+		$finish;
+		//
+	end
+	
+	
+		// Test Task: writes X into the source memory one 32-bit word at a
+		// time, runs the mover once, reads the destination memory back and
+		// checks that the 256-bit value read equals X.
+		// Scratch signals used by the task:
+	reg				y_ok;		// 1 when the value read back equals x
+
+	integer			w;			// word index for the write and read loops
+	
+	reg	[255:0]	x_shreg;	// remaining words of x still to be written
+	reg	[255:0]	y_shreg;	// value assembled from the words read back
+	
+	task test_modular_mover;
+	
+		input	[255:0]	x;		// value to be moved
+				
+		begin
+		
+				/* start filling memories */
+			tb_x_wren	= 1;
+			
+				/* initialize shift registers */
+			x_shreg = x;
+			
+				/* write all the words, least significant word first */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set addresses */
+				tb_x_addr	= w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* set data words */
+				tb_x_data	= x_shreg[31:0];
+				
+					/* shift inputs right by one word, vacated bits become X */
+				x_shreg = {{32{1'bX}}, x_shreg[255:32]};
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+			end
+			
+				/* wipe addresses */
+			tb_x_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+			
+				/* wipe data words */
+			tb_x_data	= {32{1'bX}};
+			
+				/* stop filling memories */
+			tb_x_wren	= 0;
+			
+				/* start operation */
+			ena = 1;
+			
+				/* deassert enable again after 1 clock tick */
+			#10 ena = 0;
+			
+				/* wait for operation to complete (rdy is polled once per clock tick) */
+			while (!rdy) #10;
+			
+				/* read result, least significant word first */
+			for (w=0; w<OPERAND_NUM_WORDS; w=w+1) begin
+				
+					/* set address */
+				tb_y_addr	= w[WORD_COUNTER_WIDTH-1:0];
+				
+					/* wait for 1 clock tick */
+				#10;
+				
+					/* shift the new word in from the top; after all words are read, word 0 sits in bits [31:0] */
+				y_shreg = {tb_y_data, y_shreg[255:32]};
+
+			end				
+			
+				/* compare the value read back with the value written */
+			y_ok = (y_shreg == x);
+
+				/* display results */
+			$display("test_modular_mover(): %s", y_ok ? "OK" : "ERROR");
+			
+				/* update global flag */
+			ok = ok && y_ok;
+		
+		end
+		
+	endtask
+	
+      
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/curve/curve_dbl_add_256.v b/rtl/curve/curve_dbl_add_256.v
new file mode 100644
index 0000000..08a9931
--- /dev/null
+++ b/rtl/curve/curve_dbl_add_256.v
@@ -0,0 +1,868 @@
+//------------------------------------------------------------------------------
+//
+// curve_dbl_add_256.v
+// -----------------------------------------------------------------------------
+// Elliptic curve point doubler/adder (microcode-driven).
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module curve_dbl_add_256
+	(
+		clk, rst_n,
+		ena, rdy,
+		uop_addr, uop,
+		px_addr, py_addr, pz_addr, rx_addr, ry_addr, rz_addr, q_addr, v_addr,
+		rx_wren, ry_wren, rz_wren,
+		px_din, py_din, pz_din,
+		rx_din, ry_din, rz_din,
+		rx_dout, ry_dout, rz_dout, q_din, v_din
+	);
+
+
+		//
+		// Microcode
+		//
+`include "uop_ecdsa.v"
+
+
+		//
+		// Constants
+		//
+	localparam	WORD_COUNTER_WIDTH	= 3;	// 0 .. 7
+	localparam	OPERAND_NUM_WORDS		= 8;	// 8 * 32 = 256
+
+	
+		//
+		// Ports
+		//
+	input		wire	clk;		// system clock
+	input		wire	rst_n;	// active-low async reset
+	
+	input		wire	ena;		// enable input
+	output	wire	rdy;		// ready output
+		
+	output	reg	[ 6-1: 0]	uop_addr;
+	input		wire	[20-1: 0]	uop;
+		
+	output	reg	[WORD_COUNTER_WIDTH-1:0]	px_addr;
+	output	reg	[WORD_COUNTER_WIDTH-1:0]	py_addr;
+	output	reg	[WORD_COUNTER_WIDTH-1:0]	pz_addr;
+	output	reg	[WORD_COUNTER_WIDTH-1:0]	rx_addr;
+	output	reg	[WORD_COUNTER_WIDTH-1:0]	ry_addr;
+	output	reg	[WORD_COUNTER_WIDTH-1:0]	rz_addr;
+	output	reg	[WORD_COUNTER_WIDTH-1:0]	v_addr;
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	q_addr;
+	
+	output	wire										rx_wren;
+	output	wire										ry_wren;
+	output	wire										rz_wren;
+
+	input		wire	[                32-1:0]	px_din;
+	input		wire	[                32-1:0]	py_din;
+	input		wire	[                32-1:0]	pz_din;
+	input		wire	[                32-1:0]	rx_din;
+	input		wire	[                32-1:0]	ry_din;
+	input		wire	[                32-1:0]	rz_din;
+	output	wire	[                32-1:0]	rx_dout;
+	output	wire	[                32-1:0]	ry_dout;
+	output	wire	[                32-1:0]	rz_dout;
+	input		wire	[                32-1:0]	q_din;
+	input		wire	[                32-1:0]	v_din;
+
+	
+		//
+		// Microcode
+		//   
+	wire	[ 4: 0]	uop_opcode	= uop[19:15];
+	wire	[ 4: 0]	uop_src_a	= uop[14:10];
+	wire	[ 4: 0]	uop_src_b	= uop[ 9: 5];
+	wire	[ 2: 0]	uop_dst		= uop[ 4: 2];
+	wire	[ 1: 0]	uop_exec		= uop[ 1: 0];
+	
+
+		//
+		// Multi-Word Comparator
+		//
+	wire	mw_cmp_ena;
+	wire	mw_cmp_rdy;
+		
+	wire	mw_cmp_out_l;
+	wire	mw_cmp_out_e;
+	wire	mw_cmp_out_g;
+	
+	wire	[WORD_COUNTER_WIDTH-1:0]	mw_cmp_addr_xy;
+	
+	wire	[                32-1:0]	mw_cmp_din_x;
+	wire	[                32-1:0]	mw_cmp_din_y;
+
+		// flags
+	reg	flag_pz_is_zero;
+	reg	flag_t1_is_zero;
+	reg	flag_t2_is_zero;
+	
+	mw_comparator #
+	(
+		.WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH),
+		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS)
+	)
+	mw_comparator_inst
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(mw_cmp_ena),
+		.rdy			(mw_cmp_rdy),
+		
+		.xy_addr		(mw_cmp_addr_xy),
+		.x_din		(mw_cmp_din_x),
+		.y_din		(mw_cmp_din_y),
+		
+		.cmp_l		(mw_cmp_out_l),
+		.cmp_e		(mw_cmp_out_e),
+		.cmp_g		(mw_cmp_out_g)
+	);
+	
+	
+		//
+		// Modular Adder
+		//
+	wire	mod_add_ena;
+	wire	mod_add_rdy;
+	
+	wire	[WORD_COUNTER_WIDTH-1:0]	mod_add_addr_ab;
+	wire	[WORD_COUNTER_WIDTH-1:0]	mod_add_addr_n;
+	wire	[WORD_COUNTER_WIDTH-1:0]	mod_add_addr_s;
+	wire										mod_add_wren_s;
+	
+	wire	[                32-1:0]	mod_add_din_a;
+	wire	[                32-1:0]	mod_add_din_b;
+	wire	[                32-1:0]	mod_add_din_n;
+	wire	[                32-1:0]	mod_add_dout_s;
+	
+	assign mod_add_din_n = q_din;
+	
+	modular_adder #
+	(
+		.WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH),
+		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS)
+	)
+	modular_adder_inst
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(mod_add_ena),
+		.rdy			(mod_add_rdy),
+		
+		.ab_addr		(mod_add_addr_ab),
+		.n_addr		(mod_add_addr_n),
+		.s_addr		(mod_add_addr_s),
+		.s_wren		(mod_add_wren_s),
+		
+		.a_din		(mod_add_din_a),
+		.b_din		(mod_add_din_b),
+		.n_din		(mod_add_din_n),
+		.s_dout		(mod_add_dout_s)
+	);
+	
+	
+		//
+		// Modular Subtractor
+		//
+	wire	mod_sub_ena;
+	wire	mod_sub_rdy;
+	
+	wire	[WORD_COUNTER_WIDTH-1:0]	mod_sub_addr_ab;
+	wire	[WORD_COUNTER_WIDTH-1:0]	mod_sub_addr_n;
+	wire	[WORD_COUNTER_WIDTH-1:0]	mod_sub_addr_d;
+	wire										mod_sub_wren_d;
+	
+	wire	[                32-1:0]	mod_sub_din_a;
+	wire	[                32-1:0]	mod_sub_din_b;
+	wire	[                32-1:0]	mod_sub_din_n;
+	wire	[                32-1:0]	mod_sub_dout_d;
+	
+	assign mod_sub_din_n = q_din;
+	
+	modular_subtractor #
+	(
+		.WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH),
+		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS)
+	)
+	modular_subtractor_inst
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(mod_sub_ena),
+		.rdy			(mod_sub_rdy),
+		
+		.ab_addr		(mod_sub_addr_ab),
+		.n_addr		(mod_sub_addr_n),
+		.d_addr		(mod_sub_addr_d),
+		.d_wren		(mod_sub_wren_d),
+		
+		.a_din		(mod_sub_din_a),
+		.b_din		(mod_sub_din_b),
+		.n_din		(mod_sub_din_n),
+		.d_dout		(mod_sub_dout_d)
+	);
+	
+	
+		//
+		// Modular Multiplier
+		//
+	wire	mod_mul_ena;
+	wire	mod_mul_rdy;
+	
+	wire	[WORD_COUNTER_WIDTH-1:0]	mod_mul_addr_a;
+	wire	[WORD_COUNTER_WIDTH-1:0]	mod_mul_addr_b;
+	wire	[WORD_COUNTER_WIDTH-1:0]	mod_mul_addr_n;
+	wire	[WORD_COUNTER_WIDTH-1:0]	mod_mul_addr_p;
+	wire										mod_mul_wren_p;
+	
+	wire	[                32-1:0]	mod_mul_din_a;
+	wire	[                32-1:0]	mod_mul_din_b;
+	wire	[                32-1:0]	mod_mul_din_n;
+	wire	[                32-1:0]	mod_mul_dout_p;
+	
+	assign mod_mul_din_n = q_din;
+	
+	modular_multiplier_256 modular_multiplier_inst
+	(
+		.clk		(clk),
+		.rst_n	(rst_n),
+		
+		.ena		(mod_mul_ena),
+		.rdy		(mod_mul_rdy),
+		
+		.a_addr	(mod_mul_addr_a),
+		.b_addr	(mod_mul_addr_b),
+		.n_addr	(mod_mul_addr_n),
+		.p_addr	(mod_mul_addr_p),
+		.p_wren	(mod_mul_wren_p),
+		
+		.a_din	(mod_mul_din_a),
+		.b_din	(mod_mul_din_b),
+		.n_din	(mod_mul_din_n),
+		.p_dout	(mod_mul_dout_p)
+	);
+	
+	
+		//
+		// Multi-Word Data Mover
+		// (parameters passed explicitly, as for the comparator, adder and subtractor)
+	wire	mw_mov_ena;
+	wire	mw_mov_rdy;
+	
+	wire	[WORD_COUNTER_WIDTH-1:0]	mw_mov_addr_x;
+	wire	[WORD_COUNTER_WIDTH-1:0]	mw_mov_addr_y;
+	wire										mw_mov_wren_y;
+	
+	wire	[                32-1:0]	mw_mov_din_x;
+	wire	[                32-1:0]	mw_mov_dout_y;
+	
+	mw_mover #
+	(	.WORD_COUNTER_WIDTH(WORD_COUNTER_WIDTH), .OPERAND_NUM_WORDS(OPERAND_NUM_WORDS)
+	)
+	mw_mover_inst
+	(
+		.clk		(clk),
+		.rst_n	(rst_n),
+		.ena		(mw_mov_ena),
+		.rdy		(mw_mov_rdy),
+		.x_addr	(mw_mov_addr_x),
+		.y_addr	(mw_mov_addr_y),
+		.y_wren	(mw_mov_wren_y),
+		.x_din	(mw_mov_din_x),
+		.y_dout	(mw_mov_dout_y)
+	);
+	
+	
+		//
+		// ROMs
+		//
+	reg	[WORD_COUNTER_WIDTH-1:0]	brom_one_addr;
+	//reg	[WORD_COUNTER_WIDTH-1:0]	brom_zero_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	brom_delta_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	brom_g_x_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	brom_g_y_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	brom_h_x_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	brom_h_y_addr;
+
+	wire	[                32-1:0]	brom_one_dout;
+	wire	[                32-1:0]	brom_zero_dout;
+	wire	[                32-1:0]	brom_delta_dout;
+	wire	[                32-1:0]	brom_g_x_dout;
+	wire	[                32-1:0]	brom_g_y_dout;
+	wire	[                32-1:0]	brom_h_x_dout;
+	wire	[                32-1:0]	brom_h_y_dout;
+	
+	(* ROM_STYLE="BLOCK" *) brom_p256_one brom_one_inst
+		(.clk(clk), .b_addr(brom_one_addr), .b_out(brom_one_dout));
+		
+	brom_p256_zero brom_zero_inst
+		(.b_out(brom_zero_dout));
+		
+	(* ROM_STYLE="BLOCK" *) brom_p256_delta brom_delta_inst
+		(.clk(clk), .b_addr(brom_delta_addr), .b_out(brom_delta_dout));
+	
+	(* ROM_STYLE="BLOCK" *) brom_p256_g_x brom_g_x_inst
+		(.clk(clk), .b_addr(brom_g_x_addr), .b_out(brom_g_x_dout));
+
+	(* ROM_STYLE="BLOCK" *) brom_p256_g_y brom_g_y_inst
+		(.clk(clk), .b_addr(brom_g_y_addr), .b_out(brom_g_y_dout));
+		
+	(* ROM_STYLE="BLOCK" *) brom_p256_h_x brom_h_x_inst
+		(.clk(clk), .b_addr(brom_h_x_addr), .b_out(brom_h_x_dout));
+
+	(* ROM_STYLE="BLOCK" *) brom_p256_h_y brom_h_y_inst
+		(.clk(clk), .b_addr(brom_h_y_addr), .b_out(brom_h_y_dout));
+
+	
+		//
+		// Temporary Variables
+		//
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_t1_wr_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_t2_wr_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_t3_wr_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_t4_wr_addr;
+	
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_t1_rd_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_t2_rd_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_t3_rd_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_t4_rd_addr;
+	
+	wire										bram_t1_wr_en;
+	wire										bram_t2_wr_en;
+	wire										bram_t3_wr_en;
+	wire										bram_t4_wr_en;
+	
+	wire	[                32-1:0]	bram_t1_wr_data;
+	wire	[                32-1:0]	bram_t2_wr_data;
+	wire	[                32-1:0]	bram_t3_wr_data;
+	wire	[                32-1:0]	bram_t4_wr_data;
+	
+	wire	[                32-1:0]	bram_t1_rd_data;
+	wire	[                32-1:0]	bram_t2_rd_data;
+	wire	[                32-1:0]	bram_t3_rd_data;
+	wire	[                32-1:0]	bram_t4_rd_data;
+		
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH)
+	)
+	bram_t1
+	(	.clk		(clk),
+		.a_addr(bram_t1_wr_addr), .a_wr(bram_t1_wr_en), .a_in(bram_t1_wr_data), .a_out(),
+		.b_addr(bram_t1_rd_addr),                                               .b_out(bram_t1_rd_data)
+	);
+	
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH)
+	)
+	bram_t2
+	(	.clk		(clk),
+		.a_addr(bram_t2_wr_addr), .a_wr(bram_t2_wr_en), .a_in(bram_t2_wr_data), .a_out(),
+		.b_addr(bram_t2_rd_addr),                                               .b_out(bram_t2_rd_data)
+	);
+	
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH)
+	)
+	bram_t3
+	(	.clk		(clk),
+		.a_addr(bram_t3_wr_addr), .a_wr(bram_t3_wr_en), .a_in(bram_t3_wr_data), .a_out(),
+		.b_addr(bram_t3_rd_addr),                                               .b_out(bram_t3_rd_data)
+	);
+	
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH)
+	)
+	bram_t4
+	(	.clk		(clk),
+		.a_addr(bram_t4_wr_addr), .a_wr(bram_t4_wr_en), .a_in(bram_t4_wr_data), .a_out(),
+		.b_addr(bram_t4_rd_addr),                                               .b_out(bram_t4_rd_data)
+	);
+	
+
+		//
+		// FSM
+		// (declared ahead of the trigger logic below, which reads fsm_state)
+	localparam	[ 1: 0]	FSM_STATE_STALL	= 2'b00;
+	localparam	[ 1: 0]	FSM_STATE_FETCH	= 2'b01;
+	localparam	[ 1: 0]	FSM_STATE_EXECUTE	= 2'b10;
+	
+	reg	[ 1: 0]	fsm_state		= FSM_STATE_STALL;
+	wire	[ 1: 0]	fsm_state_next	= (uop_opcode == OPCODE_RDY) ? FSM_STATE_STALL : FSM_STATE_FETCH;
+	
+
+		//
+		// uOP Trigger Logic
+		// (uop_trig is high for the one clock cycle that follows each FETCH state)
+	reg	uop_trig;
+	always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0)	uop_trig <= 1'b0;
+		else						uop_trig <= (fsm_state == FSM_STATE_FETCH) ? 1'b1 : 1'b0;
+	
+	
+		//
+		// FSM Transition Logic
+		// STALL -> FETCH on ena; FETCH -> EXECUTE; EXECUTE -> fsm_state_next when the uop is done
+	reg	uop_done;		// completion flag of the micro-operation being executed
+	
+	always @(posedge clk or negedge rst_n)
+		// EXECUTE is left only on a cycle where uop_done is set and uop_trig is low
+		if (rst_n == 1'b0)		fsm_state <= FSM_STATE_STALL;
+		else case (fsm_state)
+			FSM_STATE_STALL:		fsm_state <= ena ? FSM_STATE_FETCH : FSM_STATE_STALL;
+			FSM_STATE_FETCH:		fsm_state <= FSM_STATE_EXECUTE;
+			FSM_STATE_EXECUTE:	fsm_state <= (!uop_trig && uop_done) ? fsm_state_next : FSM_STATE_EXECUTE;
+			default:					fsm_state <= FSM_STATE_STALL;
+		endcase
+	
+	
+		//
+		// uOP Address Increment Logic
+		// (no reset term: the STALL state itself forces the address back to zero)
+	always @(posedge clk)
+		// constants are sized to match the 6-bit uop_addr port
+		if (fsm_state == FSM_STATE_STALL)
+			uop_addr <= 6'd0;
+		else if (fsm_state == FSM_STATE_EXECUTE)
+			if (!uop_trig && uop_done)
+				uop_addr <= (uop_opcode == OPCODE_RDY) ? 6'd0 : uop_addr + 1'b1;
+		
+	
+		//
+		// uOP Completion Logic
+		//
+	always @(*)
+		// uop_done mirrors the rdy output of the helper selected by the opcode
+		case (uop_opcode)
+			OPCODE_CMP:	uop_done = mw_cmp_rdy;
+			OPCODE_MOV:	uop_done = mw_mov_rdy;
+			OPCODE_ADD:	uop_done = mod_add_rdy;
+			OPCODE_SUB:	uop_done = mod_sub_rdy;
+			OPCODE_MUL:	uop_done = mod_mul_rdy;
+			OPCODE_RDY:	uop_done = 1'b1;		// nothing to wait for
+			default:		uop_done = 1'b0;		// unknown opcode never completes
+		endcase
+	
+
+		// Helper Modules Enable Logic
+		// Each helper is started by uop_trig gated with a single bit of the opcode.
+		// NOTE(review): presumably the opcodes in uop_ecdsa.v are one-hot -- confirm.
+	assign mw_cmp_ena		= uop_opcode[0] & uop_trig;
+	assign mw_mov_ena		= uop_opcode[1] & uop_trig;
+	assign mod_add_ena	= uop_opcode[2] & uop_trig;
+	assign mod_sub_ena	= uop_opcode[3] & uop_trig;
+	assign mod_mul_ena	= uop_opcode[4] & uop_trig;
+	
+	
+		//
+		// uOP Source Value Decoding Logic
+		// (two identical multiplexers, one per source operand field of the uop)
+	reg	[31: 0]	uop_src_a_value;
+	
+	always @(*)
+		// select the data word of the buffer named by source field A
+		case (uop_src_a)
+			UOP_SRC_PX:		uop_src_a_value = px_din;
+			UOP_SRC_PY:		uop_src_a_value = py_din;
+			UOP_SRC_PZ:		uop_src_a_value = pz_din;
+			
+			UOP_SRC_RX:		uop_src_a_value = rx_din;
+			UOP_SRC_RY:		uop_src_a_value = ry_din;
+			UOP_SRC_RZ:		uop_src_a_value = rz_din;
+			
+			UOP_SRC_T1:		uop_src_a_value = bram_t1_rd_data;
+			UOP_SRC_T2:		uop_src_a_value = bram_t2_rd_data;
+			UOP_SRC_T3:		uop_src_a_value = bram_t3_rd_data;
+			UOP_SRC_T4:		uop_src_a_value = bram_t4_rd_data;
+			
+			UOP_SRC_ONE:	uop_src_a_value = brom_one_dout;
+			UOP_SRC_ZERO:	uop_src_a_value = brom_zero_dout;
+			UOP_SRC_DELTA:	uop_src_a_value = brom_delta_dout;
+			
+			UOP_SRC_G_X:	uop_src_a_value = brom_g_x_dout;
+			UOP_SRC_G_Y:	uop_src_a_value = brom_g_y_dout;
+
+			UOP_SRC_H_X:	uop_src_a_value = brom_h_x_dout;
+			UOP_SRC_H_Y:	uop_src_a_value = brom_h_y_dout;
+			
+			UOP_SRC_V:		uop_src_a_value = v_din;
+			
+			default:			uop_src_a_value = {32{1'bX}};
+		endcase
+
+		
+	assign mw_cmp_din_x  = uop_src_a_value;		// operand A feeds every helper module
+	assign mw_mov_din_x  = uop_src_a_value;
+	assign mod_add_din_a = uop_src_a_value;
+	assign mod_sub_din_a = uop_src_a_value;
+	assign mod_mul_din_a = uop_src_a_value;
+	
+	reg	[31: 0]	uop_src_b_value;
+		
+	always @(*)
+		// select the data word of the buffer named by source field B
+		case (uop_src_b)
+			UOP_SRC_PX:		uop_src_b_value = px_din;
+			UOP_SRC_PY:		uop_src_b_value = py_din;
+			UOP_SRC_PZ:		uop_src_b_value = pz_din;
+			
+			UOP_SRC_RX:		uop_src_b_value = rx_din;
+			UOP_SRC_RY:		uop_src_b_value = ry_din;
+			UOP_SRC_RZ:		uop_src_b_value = rz_din;
+			
+			UOP_SRC_T1:		uop_src_b_value = bram_t1_rd_data;
+			UOP_SRC_T2:		uop_src_b_value = bram_t2_rd_data;
+			UOP_SRC_T3:		uop_src_b_value = bram_t3_rd_data;
+			UOP_SRC_T4:		uop_src_b_value = bram_t4_rd_data;
+			
+			UOP_SRC_ONE:	uop_src_b_value = brom_one_dout;
+			UOP_SRC_ZERO:	uop_src_b_value = brom_zero_dout;
+			UOP_SRC_DELTA:	uop_src_b_value = brom_delta_dout;
+
+			UOP_SRC_G_X:	uop_src_b_value = brom_g_x_dout;
+			UOP_SRC_G_Y:	uop_src_b_value = brom_g_y_dout;
+
+			UOP_SRC_H_X:	uop_src_b_value = brom_h_x_dout;
+			UOP_SRC_H_Y:	uop_src_b_value = brom_h_y_dout;
+			
+			UOP_SRC_V:		uop_src_b_value = v_din;
+			
+			default:			uop_src_b_value = {32{1'bX}};
+		endcase
+	
+	assign mw_cmp_din_y  = uop_src_b_value;		// operand B feeds every helper except the mover
+	assign mod_add_din_b = uop_src_b_value;
+	assign mod_sub_din_b = uop_src_b_value;
+	assign mod_mul_din_b = uop_src_b_value;
+	
+	
+		//
+		// uOP Source & Destination Address Decoding Logic
+		// (the helper selected by the opcode drives all word addresses)
+	reg	[WORD_COUNTER_WIDTH-1:0]	uop_src_a_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	uop_src_b_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	uop_dst_addr;
+	reg	[WORD_COUNTER_WIDTH-1:0]	uop_q_addr;
+	
+	assign q_addr = uop_q_addr;
+	
+	always @(*)
+		// addresses a helper does not provide are driven to X
+		case (uop_opcode)
+			// comparator: one shared read address, no destination, no modulus
+			OPCODE_CMP:	begin
+				uop_src_a_addr = mw_cmp_addr_xy;
+				uop_src_b_addr = mw_cmp_addr_xy;
+				uop_dst_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+				uop_q_addr		= {WORD_COUNTER_WIDTH{1'bX}};
+			end
+			// mover: single source, no second operand, no modulus
+			OPCODE_MOV:	begin
+				uop_src_a_addr = mw_mov_addr_x;
+				uop_src_b_addr = {WORD_COUNTER_WIDTH{1'bX}};
+				uop_dst_addr	= mw_mov_addr_y;
+				uop_q_addr		= {WORD_COUNTER_WIDTH{1'bX}};
+			end
+			// modular adder: both operands share one read address
+			OPCODE_ADD:	begin
+				uop_src_a_addr = mod_add_addr_ab;
+				uop_src_b_addr = mod_add_addr_ab;
+				uop_dst_addr	= mod_add_addr_s;
+				uop_q_addr		= mod_add_addr_n;
+			end
+			// modular subtractor: both operands share one read address
+			OPCODE_SUB:	begin
+				uop_src_a_addr = mod_sub_addr_ab;
+				uop_src_b_addr = mod_sub_addr_ab;
+				uop_dst_addr	= mod_sub_addr_d;
+				uop_q_addr		= mod_sub_addr_n;
+			end
+			// modular multiplier: separate read addresses for the two operands
+			OPCODE_MUL:	begin
+				uop_src_a_addr = mod_mul_addr_a;
+				uop_src_b_addr = mod_mul_addr_b;
+				uop_dst_addr	= mod_mul_addr_p;
+				uop_q_addr		= mod_mul_addr_n;
+			end
+			// any other opcode: no memory access
+			default: begin
+				uop_src_a_addr = {WORD_COUNTER_WIDTH{1'bX}};
+				uop_src_b_addr = {WORD_COUNTER_WIDTH{1'bX}};
+				uop_dst_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+				uop_q_addr		= {WORD_COUNTER_WIDTH{1'bX}};
+			end
+			//
+		endcase
+	
+	
+		//
+		// uOP Conditional Execution Logic
+		// (decides from the latched zero flags whether a conditional uop takes effect)
+	reg	uop_exec_effective;
+
+	always @(*)
+		case (uop_exec)
+			UOP_EXEC_ALWAYS:		uop_exec_effective = 1'b1;
+			UOP_EXEC_PZT1T2_0XX:	uop_exec_effective =  flag_pz_is_zero;
+			UOP_EXEC_PZT1T2_100:	uop_exec_effective = !flag_pz_is_zero && flag_t1_is_zero &&  flag_t2_is_zero;
+			UOP_EXEC_PZT1T2_101:	uop_exec_effective = !flag_pz_is_zero && flag_t1_is_zero && !flag_t2_is_zero;
+			default:					uop_exec_effective = 1'b0;	// unmatched or X selector: do not execute (no latch, no stale value)
+		endcase
+
+
+		//
+		// uOP Destination Store Logic
+		// (write strobe of the active helper; only MOV is gated by conditional execution)
+	reg	uop_dst_wren;
+	
+	always @(*)
+		//
+		case (uop_opcode)
+			//
+			OPCODE_MOV:	uop_dst_wren = mw_mov_wren_y & uop_exec_effective;
+			OPCODE_ADD:	uop_dst_wren = mod_add_wren_s;
+			OPCODE_SUB:	uop_dst_wren = mod_sub_wren_d;
+			OPCODE_MUL:	uop_dst_wren = mod_mul_wren_p;
+			default:		uop_dst_wren = 1'b0;		// CMP, RDY and unknown opcodes never write
+			//
+		endcase
+		
+	
+	always @(*) begin
+		//
+		// P input buffers: the read address comes from whichever source field
+		// names the buffer (field A is checked first); otherwise it is X
+		if      (uop_src_a == UOP_SRC_PX) px_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_PX) px_addr = uop_src_b_addr;
+		else                              px_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if      (uop_src_a == UOP_SRC_PY) py_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_PY) py_addr = uop_src_b_addr;
+		else                              py_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if      (uop_src_a == UOP_SRC_PZ) pz_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_PZ) pz_addr = uop_src_b_addr;
+		else                              pz_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		// constant ROMs (the zero ROM has no address input)
+		//
+		if      (uop_src_a == UOP_SRC_ONE)   brom_one_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_ONE)   brom_one_addr = uop_src_b_addr;
+		else                                 brom_one_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		//if      (uop_src_a == UOP_SRC_ZERO)  brom_zero_addr = uop_src_a_addr;
+		//else if (uop_src_b == UOP_SRC_ZERO)  brom_zero_addr = uop_src_b_addr;
+		//else                                 brom_zero_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if      (uop_src_a == UOP_SRC_DELTA) brom_delta_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_DELTA) brom_delta_addr = uop_src_b_addr;
+		else                                 brom_delta_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		// G point coordinate ROMs
+		//
+		if      (uop_src_a == UOP_SRC_G_X) brom_g_x_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_G_X) brom_g_x_addr = uop_src_b_addr;
+		else                               brom_g_x_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if      (uop_src_a == UOP_SRC_G_Y) brom_g_y_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_G_Y) brom_g_y_addr = uop_src_b_addr;
+		else                               brom_g_y_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		// H point coordinate ROMs
+		//
+		if      (uop_src_a == UOP_SRC_H_X) brom_h_x_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_H_X) brom_h_x_addr = uop_src_b_addr;
+		else                               brom_h_x_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if      (uop_src_a == UOP_SRC_H_Y) brom_h_y_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_H_Y) brom_h_y_addr = uop_src_b_addr;
+		else                               brom_h_y_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		// external V buffer
+		//
+		if      (uop_src_a == UOP_SRC_V) v_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_V) v_addr = uop_src_b_addr;
+		else                             v_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		// temporary buffers, read port
+		//
+		if      (uop_src_a == UOP_SRC_T1) bram_t1_rd_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_T1) bram_t1_rd_addr = uop_src_b_addr;
+		else                              bram_t1_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if      (uop_src_a == UOP_SRC_T2) bram_t2_rd_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_T2) bram_t2_rd_addr = uop_src_b_addr;
+		else                              bram_t2_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if      (uop_src_a == UOP_SRC_T3) bram_t3_rd_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_T3) bram_t3_rd_addr = uop_src_b_addr;
+		else                              bram_t3_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if      (uop_src_a == UOP_SRC_T4) bram_t4_rd_addr = uop_src_a_addr;
+		else if (uop_src_b == UOP_SRC_T4) bram_t4_rd_addr = uop_src_b_addr;
+		else                              bram_t4_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		// temporary buffers, write port (separate from the read port)
+		//
+		if (uop_dst == UOP_DST_T1) bram_t1_wr_addr = uop_dst_addr;
+		else                       bram_t1_wr_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if (uop_dst == UOP_DST_T2) bram_t2_wr_addr = uop_dst_addr;
+		else                       bram_t2_wr_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if (uop_dst == UOP_DST_T3) bram_t3_wr_addr = uop_dst_addr;
+		else                       bram_t3_wr_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		if (uop_dst == UOP_DST_T4) bram_t4_wr_addr = uop_dst_addr;
+		else                       bram_t4_wr_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		//
+		// R buffers expose a single address each: the destination address is
+		// used while a word is being written, the source address otherwise
+		if ((uop_dst == UOP_DST_RX) && (uop_dst_wren))	rx_addr = uop_dst_addr;
+		else begin
+			if      (uop_src_a == UOP_SRC_RX) 				rx_addr = uop_src_a_addr;
+			else if (uop_src_b == UOP_SRC_RX) 				rx_addr = uop_src_b_addr;
+			else                              				rx_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		end
+		//
+		if ((uop_dst == UOP_DST_RY) && (uop_dst_wren))	ry_addr = uop_dst_addr;
+		else begin
+			if      (uop_src_a == UOP_SRC_RY) 				ry_addr = uop_src_a_addr;
+			else if (uop_src_b == UOP_SRC_RY) 				ry_addr = uop_src_b_addr;
+			else                              				ry_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		end
+		//
+		if ((uop_dst == UOP_DST_RZ) && (uop_dst_wren))	rz_addr = uop_dst_addr;
+		else begin
+			if      (uop_src_a == UOP_SRC_RZ) 				rz_addr = uop_src_a_addr;
+			else if (uop_src_b == UOP_SRC_RZ) 				rz_addr = uop_src_b_addr;
+			else                              				rz_addr = {WORD_COUNTER_WIDTH{1'bX}};
+		end
+		//
+	end
+	
+	
+	assign rx_wren = uop_dst_wren && (uop_dst == UOP_DST_RX);	// write strobe reaches only the buffer named by uop_dst
+	assign ry_wren = uop_dst_wren && (uop_dst == UOP_DST_RY);
+	assign rz_wren = uop_dst_wren && (uop_dst == UOP_DST_RZ);
+	
+	assign bram_t1_wr_en = uop_dst_wren && (uop_dst == UOP_DST_T1);
+	assign bram_t2_wr_en = uop_dst_wren && (uop_dst == UOP_DST_T2);
+	assign bram_t3_wr_en = uop_dst_wren && (uop_dst == UOP_DST_T3);
+	assign bram_t4_wr_en = uop_dst_wren && (uop_dst == UOP_DST_T4);
+	
+	
+	
+		//
+		// Destination Value Selector
+		// (output word of the helper selected by the opcode, X when nothing is written)
+	reg	[31: 0]	uop_dst_value;
+	
+	always @(*)
+		//
+		case (uop_opcode)
+		
+			OPCODE_MOV:	uop_dst_value = mw_mov_dout_y;
+			OPCODE_ADD:	uop_dst_value = mod_add_dout_s;
+			OPCODE_SUB:	uop_dst_value = mod_sub_dout_d;
+			OPCODE_MUL:	uop_dst_value = mod_mul_dout_p;
+			
+			default:		uop_dst_value = {32{1'bX}};
+			
+		endcase
+	
+	assign rx_dout = uop_dst_value;		// same data word is broadcast to every destination;
+	assign ry_dout = uop_dst_value;		// the write strobes above decide which one stores it
+	assign rz_dout = uop_dst_value;
+	
+	assign bram_t1_wr_data = uop_dst_value;
+	assign bram_t2_wr_data = uop_dst_value;
+	assign bram_t3_wr_data = uop_dst_value;
+	assign bram_t4_wr_data = uop_dst_value;
+
+
+		//
+		// Latch Comparison Flags
+		// (the flags have no reset; each one is updated only by its own compare-with-zero uop)
+	always @(posedge clk)
+		// sample on the cycle in which a CMP uop completes
+		if (	(fsm_state  == FSM_STATE_EXECUTE) &&
+				(uop_opcode == OPCODE_CMP)        &&
+				(uop_done && !uop_trig) ) begin
+			
+			if ( (uop_src_a == UOP_SRC_PZ) && (uop_src_b == UOP_SRC_ZERO) )
+				flag_pz_is_zero <= !mw_cmp_out_l && mw_cmp_out_e && !mw_cmp_out_g;
+				
+			if ( (uop_src_a == UOP_SRC_T1) && (uop_src_b == UOP_SRC_ZERO) )
+				flag_t1_is_zero <= !mw_cmp_out_l && mw_cmp_out_e && !mw_cmp_out_g;
+				
+			if ( (uop_src_a == UOP_SRC_T2) && (uop_src_b == UOP_SRC_ZERO) )
+				flag_t2_is_zero <= !mw_cmp_out_l && mw_cmp_out_e && !mw_cmp_out_g;
+				
+		end
+
+	
+		//
+		// Ready Flag Logic
+		// (rdy is high after reset, drops when ena is seen in STALL, returns when the RDY uop completes)
+	reg rdy_reg = 1'b1;
+	assign rdy = rdy_reg;
+		
+	always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0) rdy_reg <= 1'b1;
+		else begin
+			
+				/* clear flag */
+			if (fsm_state == FSM_STATE_STALL)
+				if (ena) rdy_reg <= 1'b0;
+			
+				/* set flag */
+			if ((fsm_state == FSM_STATE_EXECUTE) && !uop_trig && uop_done)
+				if (uop_opcode == OPCODE_RDY) rdy_reg <= 1'b1;
+				
+		end
+			
+	
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/curve/curve_mul_256.v b/rtl/curve/curve_mul_256.v
new file mode 100644
index 0000000..0ac2be0
--- /dev/null
+++ b/rtl/curve/curve_mul_256.v
@@ -0,0 +1,720 @@
+//------------------------------------------------------------------------------
+//
+// curve_mul_256.v
+// -----------------------------------------------------------------------------
+// Elliptic curve point scalar multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module curve_mul_256
+	(
+		clk, rst_n,
+		ena, rdy,
+		k_addr, rx_addr, ry_addr,
+		rx_wren, ry_wren,
+		k_din,
+		rx_dout, ry_dout
+	);
+	// Sequencer: prepare, then 256 rounds of double / add / conditional copy steered by
+	// the bits of K, then inversion of RZ and a final conversion that emits RX and RY.
+		//
+		// Constants
+		//
+	localparam	WORD_COUNTER_WIDTH	= 3;	// 0 .. 7
+	localparam	OPERAND_NUM_WORDS		= 8;	// 8 * 32 = 256
+
+	
+		//
+		// Ports
+		//
+	input		wire	clk;		// system clock
+	input		wire	rst_n;	// active-low async reset
+	
+	input		wire	ena;		// enable input (acted upon in the IDLE state)
+	output	wire	rdy;		// ready output (low while an operation runs)
+				
+	output	wire	[ 2: 0]	k_addr;	// word address for K; word 7 is requested first
+	output	wire	[ 2: 0]	rx_addr;	// word addresses for the RX / RY outputs
+	output	wire	[ 2: 0]	ry_addr;
+		
+	output	wire				rx_wren;	// write strobes, active only during the final conversion
+	output	wire				ry_wren;
+
+	input		wire	[31: 0]	k_din;	// 32-bit word of K selected by k_addr
+
+	output	wire	[31: 0]	rx_dout;	// result words
+	output	wire	[31: 0]	ry_dout;
+	
+
+		//
+		// Temporary Variables
+		//
+	reg	[ 2: 0]	bram_tx_wr_addr;	// T(X,Y,Z) buffers: port A addresses
+	reg	[ 2: 0]	bram_ty_wr_addr;
+	reg	[ 2: 0]	bram_tz_wr_addr;
+	// R(X,Y,Z) buffers: port A addresses; rz1 controls are wires driven by the invertor
+	reg	[ 2: 0]	bram_rx_wr_addr;
+	reg	[ 2: 0]	bram_ry_wr_addr;
+	reg	[ 2: 0]	bram_rz_wr_addr;
+	wire	[ 2: 0]	bram_rz1_wr_addr;
+	// port B (read-only) addresses
+	reg	[ 2: 0]	bram_tx_rd_addr;
+	reg	[ 2: 0]	bram_ty_rd_addr;
+	reg	[ 2: 0]	bram_tz_rd_addr;
+	
+	reg	[ 2: 0]	bram_rx_rd_addr;
+	reg	[ 2: 0]	bram_ry_rd_addr;
+	reg	[ 2: 0]	bram_rz_rd_addr;
+	wire	[ 2: 0]	bram_rz1_rd_addr;
+	// port A write enables
+	reg				bram_tx_wr_en;
+	reg				bram_ty_wr_en;
+	reg				bram_tz_wr_en;
+
+	reg				bram_rx_wr_en;
+	reg				bram_ry_wr_en;
+	reg				bram_rz_wr_en;
+	wire				bram_rz1_wr_en;
+	// port B read data
+	wire	[31: 0]	bram_tx_rd_data;
+	wire	[31: 0]	bram_ty_rd_data;
+	wire	[31: 0]	bram_tz_rd_data;	
+
+	wire	[31: 0]	bram_rx_rd_data;
+	wire	[31: 0]	bram_ry_rd_data;
+	wire	[31: 0]	bram_rz_rd_data;
+	wire	[31: 0]	bram_rz1_rd_data;
+	// port A write data
+	reg	[31: 0]	bram_tx_wr_data_in;
+	reg	[31: 0]	bram_ty_wr_data_in;
+	reg	[31: 0]	bram_tz_wr_data_in;
+
+	reg	[31: 0]	bram_rx_wr_data_in;
+	reg	[31: 0]	bram_ry_wr_data_in;
+	reg	[31: 0]	bram_rz_wr_data_in;
+	wire	[31: 0]	bram_rz1_wr_data_in;
+	// port A read-back data (a_out of each buffer; rz1 has none connected)
+	wire	[31: 0]	bram_tx_wr_data_out;
+	wire	[31: 0]	bram_ty_wr_data_out;
+	wire	[31: 0]	bram_tz_wr_data_out;
+	
+	wire	[31: 0]	bram_rx_wr_data_out;
+	wire	[31: 0]	bram_ry_wr_data_out;
+	wire	[31: 0]	bram_rz_wr_data_out;
+		
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(3))	// each buffer: 32-bit words, 3 address bits
+	bram_tx (.clk(clk),
+		.a_addr(bram_tx_wr_addr), .a_wr(bram_tx_wr_en), .a_in(bram_tx_wr_data_in), .a_out(bram_tx_wr_data_out),
+		.b_addr(bram_tx_rd_addr),                                                  .b_out(bram_tx_rd_data));
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(3))
+	bram_ty (.clk(clk),
+		.a_addr(bram_ty_wr_addr), .a_wr(bram_ty_wr_en), .a_in(bram_ty_wr_data_in), .a_out(bram_ty_wr_data_out),
+		.b_addr(bram_ty_rd_addr),                                                  .b_out(bram_ty_rd_data));
+
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(3))
+	bram_tz (.clk(clk),
+		.a_addr(bram_tz_wr_addr), .a_wr(bram_tz_wr_en), .a_in(bram_tz_wr_data_in), .a_out(bram_tz_wr_data_out),
+		.b_addr(bram_tz_rd_addr),                                                  .b_out(bram_tz_rd_data));
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(3))
+	bram_rx (.clk(clk),
+		.a_addr(bram_rx_wr_addr), .a_wr(bram_rx_wr_en), .a_in(bram_rx_wr_data_in), .a_out(bram_rx_wr_data_out),
+		.b_addr(bram_rx_rd_addr),                                                  .b_out(bram_rx_rd_data));
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(3))
+	bram_ry (.clk(clk),
+		.a_addr(bram_ry_wr_addr), .a_wr(bram_ry_wr_en), .a_in(bram_ry_wr_data_in), .a_out(bram_ry_wr_data_out),
+		.b_addr(bram_ry_rd_addr),                                                  .b_out(bram_ry_rd_data));
+
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(3))
+	bram_rz (.clk(clk),
+		.a_addr(bram_rz_wr_addr), .a_wr(bram_rz_wr_en), .a_in(bram_rz_wr_data_in), .a_out(bram_rz_wr_data_out),
+		.b_addr(bram_rz_rd_addr),                                                  .b_out(bram_rz_rd_data));
+		// rz1: written by the invertor, read through the worker's v port; a_out is left open
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(3))
+	bram_rz1 (.clk(clk),
+		.a_addr(bram_rz1_wr_addr), .a_wr(bram_rz1_wr_en), .a_in(bram_rz1_wr_data_in), .a_out(),
+		.b_addr(bram_rz1_rd_addr),                                                    .b_out(bram_rz1_rd_data));
+
+
+		//
+		// FSM
+		//
+	localparam	[ 3: 0]	FSM_STATE_IDLE				= 4'd00;	// waits for ena
+	localparam	[ 3: 0]	FSM_STATE_PREPARE_TRIG	= 4'd01;	// *_TRIG: left after one cycle, causes a start pulse
+	localparam	[ 3: 0]	FSM_STATE_PREPARE_WAIT	= 4'd02;	// *_WAIT: held until the started unit reports done
+	localparam	[ 3: 0]	FSM_STATE_DOUBLE_TRIG	= 4'd03;
+	localparam	[ 3: 0]	FSM_STATE_DOUBLE_WAIT	= 4'd04;
+	localparam	[ 3: 0]	FSM_STATE_ADD_TRIG		= 4'd05;
+	localparam	[ 3: 0]	FSM_STATE_ADD_WAIT		= 4'd06;
+	localparam	[ 3: 0]	FSM_STATE_COPY_TRIG		= 4'd07;
+	localparam	[ 3: 0]	FSM_STATE_COPY_WAIT		= 4'd08;
+	localparam	[ 3: 0]	FSM_STATE_INVERT_TRIG	= 4'd09;
+	localparam	[ 3: 0]	FSM_STATE_INVERT_WAIT	= 4'd10;
+	localparam	[ 3: 0]	FSM_STATE_CONVERT_TRIG	= 4'd11;
+	localparam	[ 3: 0]	FSM_STATE_CONVERT_WAIT	= 4'd12;
+	localparam	[ 3: 0]	FSM_STATE_DONE				= 4'd13;	// left after one cycle, sets rdy
+	// current state; IDLE is both the initial value and the reset value
+	reg [3:0] fsm_state = FSM_STATE_IDLE;
+	
+
+		//
+		// Round Counter
+		//
+	reg	[ 7: 0]	bit_counter;	// number of the round in progress, 0 .. 255
+	wire	[ 7: 0]	bit_counter_max = 8'd255;
+	wire	[ 7: 0]	bit_counter_zero = 8'd0;
+	wire	[ 7: 0]	bit_counter_next =
+		(bit_counter < bit_counter_max) ? bit_counter + 1'b1 : bit_counter_zero;
+		// (next value: plus one, going back to zero after the last round)
+		
+		// State to enter after a round's copy step:
+		// Round Completion
+		// another doubling, or the inversion once the counter has reached its maximum.
+	wire [ 3: 0]	fsm_state_round_next = (bit_counter < bit_counter_max) ?
+		FSM_STATE_DOUBLE_TRIG : FSM_STATE_INVERT_TRIG;
+		
+
+		//
+		// OP Trigger Logic
+		//
+	reg	op_trig;
+	wire	op_done;
+	
+	always @(posedge clk or negedge rst_n)
+		// Registered start request for the worker: follows the four worker *_TRIG states.
+		if (!rst_n)
+			op_trig <= 1'b0;
+		else
+			op_trig <= (fsm_state == FSM_STATE_PREPARE_TRIG) || (fsm_state == FSM_STATE_DOUBLE_TRIG) ||
+			           (fsm_state == FSM_STATE_ADD_TRIG)     || (fsm_state == FSM_STATE_CONVERT_TRIG);
+		
+		//
+		// Microprograms
+		//
+	wire	[ 5: 0]	op_rom_addr;
+	wire	[19: 0]	op_rom_init_data;
+	wire	[19: 0]	op_rom_dbl_data;
+	wire	[19: 0]	op_rom_add_data;
+	wire	[19: 0]	op_rom_conv_data;
+	reg	[19: 0]	op_rom_mux_data;
+	// Four microprogram ROMs share one 6-bit address and each return a 20-bit word.
+	(* RAM_STYLE="BLOCK" *)
+	uop_init_rom op_rom_init
+	(
+		.clk	(clk),
+		.addr	(op_rom_addr),
+		.data	(op_rom_init_data)
+	);
+	
+	(* RAM_STYLE="BLOCK" *)
+	uop_dbl_rom op_rom_dbl
+	(
+		.clk	(clk),
+		.addr	(op_rom_addr),
+		.data	(op_rom_dbl_data)
+	);
+
+	(* RAM_STYLE="BLOCK" *)
+	uop_add_rom op_rom_add
+	(
+		.clk	(clk),
+		.addr	(op_rom_addr),
+		.data	(op_rom_add_data)
+	);
+	
+	(* RAM_STYLE="BLOCK" *)
+	uop_conv_rom op_rom_conv
+	(
+		.clk	(clk),
+		.addr	(op_rom_addr),
+		.data	(op_rom_conv_data)
+	);
+	
+	always @(*)
+		// Forward the ROM that belongs to the running operation; X in all other states.
+		case (fsm_state)
+			FSM_STATE_PREPARE_WAIT:	op_rom_mux_data = op_rom_init_data;
+			FSM_STATE_DOUBLE_WAIT:	op_rom_mux_data = op_rom_dbl_data;
+			FSM_STATE_ADD_WAIT:		op_rom_mux_data = op_rom_add_data;
+			FSM_STATE_CONVERT_WAIT:	op_rom_mux_data = op_rom_conv_data;
+			default:						op_rom_mux_data = {20{1'bX}};
+		endcase
+
+
+	
+		//
+		// Modulus
+		//
+	reg	[ 2: 0]	rom_q_addr;
+	wire	[31: 0]	rom_q_data;
+	// One ROM serves both the worker and the invertor; its address is chosen per FSM state.
+	brom_p256_q rom_q
+   (
+		.clk		(clk),
+		.b_addr	(rom_q_addr),
+		.b_out	(rom_q_data)
+    );
+
+
+		//
+		// Worker
+		//
+	wire	[ 2: 0]	worker_addr_px;	// read addresses for operand P
+	wire	[ 2: 0]	worker_addr_py;
+	wire	[ 2: 0]	worker_addr_pz;
+	// addresses for result R (the x/y ones also drive the module's rx_addr / ry_addr)
+	wire	[ 2: 0]	worker_addr_rx;
+	wire	[ 2: 0]	worker_addr_ry;
+	wire	[ 2: 0]	worker_addr_rz;
+
+	wire	[ 2: 0]	worker_addr_q;
+	// write strobes for result R
+	wire				worker_wren_rx;
+	wire				worker_wren_ry;
+	wire				worker_wren_rz;
+	// data inputs; which buffer feeds them is decided per FSM state in the always @(*) mux
+	reg	[31: 0]	worker_din_px;
+	reg	[31: 0]	worker_din_py;
+	reg	[31: 0]	worker_din_pz;
+	
+	reg	[31: 0]	worker_din_rx;
+	reg	[31: 0]	worker_din_ry;
+	reg	[31: 0]	worker_din_rz;
+	
+	wire	[31: 0]	worker_dout_rx;
+	wire	[31: 0]	worker_dout_ry;
+	wire	[31: 0]	worker_dout_rz;
+	// Started by op_trig, reports op_done; receives the microprogram word in op_rom_mux_data.
+	curve_dbl_add_256 worker
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(op_trig),
+		.rdy			(op_done),
+		
+		.uop_addr	(op_rom_addr),
+		.uop			(op_rom_mux_data),
+		
+		.px_addr		(worker_addr_px),
+		.py_addr		(worker_addr_py),
+		.pz_addr		(worker_addr_pz),
+		
+		.rx_addr		(worker_addr_rx),
+		.ry_addr		(worker_addr_ry),
+		.rz_addr		(worker_addr_rz),
+		
+		.q_addr		(worker_addr_q),
+		// v port: reads the rz1 buffer, i.e. the output of the invertor
+		.v_addr		(bram_rz1_rd_addr),
+		
+		.rx_wren		(worker_wren_rx),
+		.ry_wren		(worker_wren_ry),
+		.rz_wren		(worker_wren_rz),
+		
+		.px_din		(worker_din_px),
+		.py_din		(worker_din_py),
+		.pz_din		(worker_din_pz),
+		
+		.rx_din		(worker_din_rx),
+		.ry_din		(worker_din_ry),
+		.rz_din		(worker_din_rz),
+		
+		.rx_dout		(worker_dout_rx),
+		.ry_dout		(worker_dout_ry),
+		.rz_dout		(worker_dout_rz),
+		
+		.q_din		(rom_q_data),
+		
+		.v_din		(bram_rz1_rd_data)
+	);
+
+	
+		//
+		// Mover
+		//
+	reg	move_trig;
+	wire	move_done;
+	// The mover is used purely as an address / write-strobe sequencer for the T -> R copy.
+	wire	[ 2: 0]	mover_addr_x;
+	wire	[ 2: 0]	mover_addr_y;
+	
+	wire				mover_wren_y;
+	
+	always @(posedge clk or negedge rst_n)
+		// registered start request: high in the cycle that follows the COPY_TRIG state
+		if (rst_n == 1'b0)	move_trig <= 1'b0;
+		else						move_trig <= (fsm_state == FSM_STATE_COPY_TRIG);
+	// Operand geometry is taken from the module constants rather than repeated as literals.
+	mw_mover #
+	(
+		.WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH),
+		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS)
+	)
+	mover
+	(
+		.clk		(clk),
+		.rst_n	(rst_n),
+		
+		.ena		(move_trig),
+		.rdy		(move_done),
+		
+		.x_addr	(mover_addr_x),
+		.y_addr	(mover_addr_y),
+		.y_wren	(mover_wren_y),
+		// data ports unused: words go straight from the T buffers to the R buffers
+		.x_din	({32{1'bX}}),
+		.y_dout	()
+	);
+
+
+		//
+		// Invertor
+		//
+	reg	invert_trig;
+	wire	invert_done;
+	// read addresses issued by the invertor for its operand (RZ) and for the modulus ROM
+	wire	[ 2: 0]	invertor_addr_a;
+	wire	[ 2: 0]	invertor_addr_q;
+
+	always @(posedge clk or negedge rst_n)
+		// registered start request: high in the cycle that follows the INVERT_TRIG state
+		if (rst_n == 1'b0)	invert_trig <= 1'b0;
+		else						invert_trig <= (fsm_state == FSM_STATE_INVERT_TRIG);
+	// Input a comes from the RZ buffer, q from the modulus ROM; output a1 is stored in rz1.
+	modular_invertor #
+	(
+		.MAX_OPERAND_WIDTH(256)
+	)
+	invertor
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(invert_trig),
+		.rdy			(invert_done),
+		
+		.a_addr		(invertor_addr_a),
+		.q_addr		(invertor_addr_q),
+		.a1_addr		(bram_rz1_wr_addr),
+		.a1_wren		(bram_rz1_wr_en),
+		
+		.a_din		(bram_rz_rd_data),
+		.q_din		(rom_q_data),
+		.a1_dout		(bram_rz1_wr_data_in)
+	);
+	
+	
+		//
+		// FSM Transition Logic
+		//
+	always @(posedge clk or negedge rst_n)
+		// Each *_WAIT state advances only when its unit is done and its start pulse is back low.
+		if (rst_n == 1'b0)			fsm_state <= FSM_STATE_IDLE;
+		else case (fsm_state)
+		
+			FSM_STATE_IDLE:			fsm_state <= ena ? FSM_STATE_PREPARE_TRIG : FSM_STATE_IDLE;
+			
+			FSM_STATE_PREPARE_TRIG:	fsm_state <= FSM_STATE_PREPARE_WAIT;
+			FSM_STATE_PREPARE_WAIT:	fsm_state <= (!op_trig && op_done) ? FSM_STATE_DOUBLE_TRIG : FSM_STATE_PREPARE_WAIT;
+			// one round = double, add, copy
+			FSM_STATE_DOUBLE_TRIG:	fsm_state <= FSM_STATE_DOUBLE_WAIT;
+			FSM_STATE_DOUBLE_WAIT:	fsm_state <= (!op_trig && op_done) ? FSM_STATE_ADD_TRIG : FSM_STATE_DOUBLE_WAIT;
+			
+			FSM_STATE_ADD_TRIG:		fsm_state <= FSM_STATE_ADD_WAIT;
+			FSM_STATE_ADD_WAIT:		fsm_state <= (!op_trig && op_done) ? FSM_STATE_COPY_TRIG : FSM_STATE_ADD_WAIT;
+			// after the copy: next round, or inversion after the last round
+			FSM_STATE_COPY_TRIG:		fsm_state <= FSM_STATE_COPY_WAIT;
+			FSM_STATE_COPY_WAIT:		fsm_state <= (!move_trig && move_done) ? fsm_state_round_next : FSM_STATE_COPY_WAIT;
+			
+			FSM_STATE_INVERT_TRIG:	fsm_state <= FSM_STATE_INVERT_WAIT;
+			FSM_STATE_INVERT_WAIT:	fsm_state <= (!invert_trig && invert_done) ? FSM_STATE_CONVERT_TRIG : FSM_STATE_INVERT_WAIT;
+			
+			FSM_STATE_CONVERT_TRIG:	fsm_state <= FSM_STATE_CONVERT_WAIT;
+			FSM_STATE_CONVERT_WAIT:	fsm_state <= (!op_trig && op_done) ? FSM_STATE_DONE : FSM_STATE_CONVERT_WAIT;
+			
+			FSM_STATE_DONE:			fsm_state <= FSM_STATE_IDLE;
+			// unused encodings (14, 15) fall back to IDLE
+			default:						fsm_state <= FSM_STATE_IDLE;
+					
+		endcase
+	
+	
+		//
+		// Bit Counter Increment
+		//
+	always @(posedge clk) begin
+		// rewind when the preparation step has just finished
+		if ((fsm_state == FSM_STATE_PREPARE_WAIT) && op_done && !op_trig)
+			bit_counter <= bit_counter_zero;
+		// advance when the copy step of a round has just finished
+		else if ((fsm_state == FSM_STATE_COPY_WAIT) && move_done && !move_trig)
+			bit_counter <= bit_counter_next;
+		//
+	end
+
+
+		//
+		// K Latch Logic
+		//
+	reg	[ 2: 0]	k_addr_reg;
+	reg	[31: 0]	k_din_reg;
+	
+	assign k_addr = k_addr_reg;
+	
+	always @(posedge clk) begin
+		// word select: counter bits 7:5 step through the eight words of K, word 7 first
+		if (fsm_state == FSM_STATE_DOUBLE_TRIG)
+			k_addr_reg <= 3'd7 - bit_counter[7:5];
+		// every 32nd round take a fresh word, otherwise shift left so the next bit lands in [31]
+		if (fsm_state == FSM_STATE_ADD_TRIG)
+			k_din_reg <= (bit_counter[4:0] == 5'd0) ? k_din : {k_din_reg[30:0], 1'bX};
+		// (only bit [31] is ever examined, and the X fill cannot reach it before the next reload)
+	end
+			
+		
+		
+		//
+		// Copy Inhibit Logic
+		//
+	wire	move_inhibit = k_din_reg[31];	// current bit of K, taken from the top of the shift register
+	// R is overwritten with T during the copy step only when that bit is 0
+	wire	copy_t2r_int = mover_wren_y & ~move_inhibit;
+		
+	
+	always @(*) begin
+		// Combinational routing of the memory ports; blocking assignments are used throughout.
+		// Q
+		// (modulus ROM address: the invertor owns it while inverting, the worker otherwise)
+		case (fsm_state)
+			FSM_STATE_DOUBLE_WAIT:	rom_q_addr = worker_addr_q;
+			FSM_STATE_ADD_WAIT:		rom_q_addr = worker_addr_q;
+			FSM_STATE_INVERT_WAIT:	rom_q_addr = invertor_addr_q;
+			FSM_STATE_CONVERT_WAIT:	rom_q_addr = worker_addr_q;
+			default:						rom_q_addr = worker_addr_q;
+		endcase
+			
+		//
+		// R(X,Y,Z)
+		// (written by the worker while preparing / adding and by the copy step, read otherwise)
+		case (fsm_state)
+			// worker result goes into R
+			FSM_STATE_PREPARE_WAIT: begin
+				//
+				bram_rx_rd_addr     = {3{1'bX}};      bram_ry_rd_addr     = {3{1'bX}};      bram_rz_rd_addr     = {3{1'bX}};
+				bram_rx_wr_addr     = worker_addr_rx; bram_ry_wr_addr     = worker_addr_ry; bram_rz_wr_addr     = worker_addr_rz;
+				bram_rx_wr_en       = worker_wren_rx; bram_ry_wr_en       = worker_wren_ry; bram_rz_wr_en       = worker_wren_rz;
+				bram_rx_wr_data_in  = worker_dout_rx; bram_ry_wr_data_in  = worker_dout_ry; bram_rz_wr_data_in  = worker_dout_rz;
+				//
+			end
+			// R is read as operand P
+			FSM_STATE_DOUBLE_WAIT: begin
+				//
+				bram_rx_rd_addr     = worker_addr_px; bram_ry_rd_addr     = worker_addr_py; bram_rz_rd_addr     = worker_addr_pz;
+				bram_rx_wr_addr     = {3{1'bX}};      bram_ry_wr_addr     = {3{1'bX}};      bram_rz_wr_addr     = {3{1'bX}};
+				bram_rx_wr_en       = 1'b0;           bram_ry_wr_en       = 1'b0;           bram_rz_wr_en       = 1'b0;
+				bram_rx_wr_data_in  = {32{1'bX}};     bram_ry_wr_data_in  = {32{1'bX}};     bram_rz_wr_data_in  = {32{1'bX}};
+				//
+			end
+			// worker result goes into R
+			FSM_STATE_ADD_WAIT: begin
+				//
+				bram_rx_rd_addr     = {3{1'bX}};      bram_ry_rd_addr     = {3{1'bX}};      bram_rz_rd_addr     = {3{1'bX}};
+				bram_rx_wr_addr     = worker_addr_rx; bram_ry_wr_addr     = worker_addr_ry; bram_rz_wr_addr     = worker_addr_rz;
+				bram_rx_wr_en       = worker_wren_rx; bram_ry_wr_en       = worker_wren_ry; bram_rz_wr_en       = worker_wren_rz;
+				bram_rx_wr_data_in  = worker_dout_rx; bram_ry_wr_data_in  = worker_dout_ry; bram_rz_wr_data_in  = worker_dout_rz;
+				//
+			end
+			// T is copied into R unless the copy is inhibited
+			FSM_STATE_COPY_WAIT: begin
+				//
+				bram_rx_rd_addr     = {3{1'bX}};       bram_ry_rd_addr     = {3{1'bX}};       bram_rz_rd_addr     = {3{1'bX}};
+				bram_rx_wr_addr     = mover_addr_y;    bram_ry_wr_addr     = mover_addr_y;    bram_rz_wr_addr     = mover_addr_y;
+				bram_rx_wr_en       = copy_t2r_int;    bram_ry_wr_en       = copy_t2r_int;    bram_rz_wr_en       = copy_t2r_int;
+				bram_rx_wr_data_in  = bram_tx_rd_data; bram_ry_wr_data_in  = bram_ty_rd_data; bram_rz_wr_data_in  = bram_tz_rd_data;
+				//
+			end
+			// the invertor reads RZ
+			FSM_STATE_INVERT_WAIT: begin
+				//
+				bram_rx_rd_addr     = {3{1'bX}};  bram_ry_rd_addr     = {3{1'bX}};  bram_rz_rd_addr     = invertor_addr_a;
+				bram_rx_wr_addr     = {3{1'bX}};  bram_ry_wr_addr     = {3{1'bX}};  bram_rz_wr_addr     = {3{1'bX}};
+				bram_rx_wr_en       = 1'b0;       bram_ry_wr_en       = 1'b0;       bram_rz_wr_en       = 1'b0;
+				bram_rx_wr_data_in  = {32{1'bX}}; bram_ry_wr_data_in  = {32{1'bX}}; bram_rz_wr_data_in  = {32{1'bX}};
+				//
+			end
+			// R is read as operand P
+			FSM_STATE_CONVERT_WAIT: begin
+				//
+				bram_rx_rd_addr     = worker_addr_px; bram_ry_rd_addr     = worker_addr_py; bram_rz_rd_addr     = worker_addr_pz;
+				bram_rx_wr_addr     = {3{1'bX}};      bram_ry_wr_addr     = {3{1'bX}};      bram_rz_wr_addr     = {3{1'bX}};
+				bram_rx_wr_en       = 1'b0;           bram_ry_wr_en       = 1'b0;           bram_rz_wr_en       = 1'b0;
+				bram_rx_wr_data_in  = {32{1'bX}};     bram_ry_wr_data_in  = {32{1'bX}};     bram_rz_wr_data_in  = {32{1'bX}};
+				//
+			end
+
+			// all other states: no writes, addresses and data are don't-care
+			default: begin
+				//
+				bram_rx_rd_addr     = {3{1'bX}};  bram_ry_rd_addr     = {3{1'bX}};  bram_rz_rd_addr     = {3{1'bX}};
+				bram_rx_wr_addr     = {3{1'bX}};  bram_ry_wr_addr     = {3{1'bX}};  bram_rz_wr_addr     = {3{1'bX}};
+				bram_rx_wr_en       = 1'b0;       bram_ry_wr_en       = 1'b0;       bram_rz_wr_en       = 1'b0;
+				bram_rx_wr_data_in  = {32{1'bX}}; bram_ry_wr_data_in  = {32{1'bX}}; bram_rz_wr_data_in  = {32{1'bX}};
+				//
+			end
+			//
+		endcase
+		//
+		// T(X,Y,Z)
+		// (written by the worker while doubling, read while adding and while copying)
+		case (fsm_state)
+			//
+			FSM_STATE_DOUBLE_WAIT: begin
+				//
+				bram_tx_rd_addr     = {3{1'bX}};      bram_ty_rd_addr     = {3{1'bX}};      bram_tz_rd_addr     = {3{1'bX}};
+				bram_tx_wr_addr     = worker_addr_rx; bram_ty_wr_addr     = worker_addr_ry; bram_tz_wr_addr     = worker_addr_rz;
+				bram_tx_wr_en       = worker_wren_rx; bram_ty_wr_en       = worker_wren_ry; bram_tz_wr_en       = worker_wren_rz;
+				bram_tx_wr_data_in  = worker_dout_rx; bram_ty_wr_data_in  = worker_dout_ry; bram_tz_wr_data_in  = worker_dout_rz;
+				//
+			end
+			//
+			FSM_STATE_ADD_WAIT: begin
+				//
+				bram_tx_rd_addr     = worker_addr_px; bram_ty_rd_addr     = worker_addr_py; bram_tz_rd_addr     = worker_addr_pz;
+				bram_tx_wr_addr     = {3{1'bX}};      bram_ty_wr_addr     = {3{1'bX}};      bram_tz_wr_addr     = {3{1'bX}};
+				bram_tx_wr_en       = 1'b0;           bram_ty_wr_en       = 1'b0;           bram_tz_wr_en       = 1'b0;
+				bram_tx_wr_data_in  = {32{1'bX}};     bram_ty_wr_data_in  = {32{1'bX}};     bram_tz_wr_data_in  = {32{1'bX}};
+				//
+			end
+			//
+			FSM_STATE_COPY_WAIT: begin
+				//
+				bram_tx_rd_addr     = mover_addr_x; bram_ty_rd_addr     = mover_addr_x; bram_tz_rd_addr     = mover_addr_x;
+				bram_tx_wr_addr     = {3{1'bX}};    bram_ty_wr_addr     = {3{1'bX}};    bram_tz_wr_addr     = {3{1'bX}};
+				bram_tx_wr_en       = 1'b0;         bram_ty_wr_en       = 1'b0;         bram_tz_wr_en       = 1'b0;
+				bram_tx_wr_data_in  = {32{1'bX}};   bram_ty_wr_data_in  = {32{1'bX}};   bram_tz_wr_data_in  = {32{1'bX}};
+				//
+			end
+			
+			//
+			default: begin
+				//
+				bram_tx_rd_addr     = {3{1'bX}};  bram_ty_rd_addr     = {3{1'bX}};  bram_tz_rd_addr     = {3{1'bX}};
+				bram_tx_wr_addr     = {3{1'bX}};  bram_ty_wr_addr     = {3{1'bX}};  bram_tz_wr_addr     = {3{1'bX}};
+				bram_tx_wr_en       = 1'b0;       bram_ty_wr_en       = 1'b0;       bram_tz_wr_en       = 1'b0;
+				bram_tx_wr_data_in  = {32{1'bX}}; bram_ty_wr_data_in  = {32{1'bX}}; bram_tz_wr_data_in  = {32{1'bX}};
+				//
+			end
+			//
+		endcase
+		//
+		// Worker
+		// (P inputs come from the buffer being read, R inputs from port A of the buffer being written)
+		case (fsm_state)
+			//
+			FSM_STATE_DOUBLE_WAIT: begin
+				//
+				worker_din_px  = bram_rx_rd_data;     worker_din_py  = bram_ry_rd_data;     worker_din_pz  = bram_rz_rd_data;
+				worker_din_rx  = bram_tx_wr_data_out; worker_din_ry  = bram_ty_wr_data_out; worker_din_rz  = bram_tz_wr_data_out;
+				//
+			end
+			//
+			FSM_STATE_ADD_WAIT: begin
+				//
+				worker_din_px  = bram_tx_rd_data;     worker_din_py  = bram_ty_rd_data;     worker_din_pz  = bram_tz_rd_data;
+				worker_din_rx  = bram_rx_wr_data_out; worker_din_ry  = bram_ry_wr_data_out; worker_din_rz  = bram_rz_wr_data_out;
+				//
+			end
+			//
+			FSM_STATE_CONVERT_WAIT: begin
+				//
+				worker_din_px  = bram_rx_rd_data; worker_din_py  = bram_ry_rd_data; worker_din_pz  = bram_rz_rd_data;
+				worker_din_rx  = {32{1'bX}};      worker_din_ry  = {32{1'bX}};      worker_din_rz  = {32{1'bX}};
+				//
+			end
+			//
+			default: begin
+				//
+				worker_din_px  = {32{1'bX}}; worker_din_py  = {32{1'bX}}; worker_din_pz  = {32{1'bX}};
+				worker_din_rx  = {32{1'bX}}; worker_din_ry  = {32{1'bX}}; worker_din_rz  = {32{1'bX}};
+				//
+			end
+			//
+		endcase
+		//
+	end
+
+
+		//
+		// Output Mapping
+		//
+	assign	rx_wren = worker_wren_rx && (fsm_state == FSM_STATE_CONVERT_WAIT);	// strobes pass only during conversion
+	assign	ry_wren = worker_wren_ry && (fsm_state == FSM_STATE_CONVERT_WAIT);
+	// data and address follow the worker at all times; only the strobes are gated
+	assign	rx_dout = worker_dout_rx;
+	assign	ry_dout = worker_dout_ry;
+	
+	assign	rx_addr = worker_addr_rx;
+	assign	ry_addr = worker_addr_ry;
+
+	
+		//
+		// Ready Flag Logic
+		//
+	reg rdy_reg = 1'b1;
+	assign rdy = rdy_reg;
+		
+	always @(posedge clk or negedge rst_n)
+		// Ready is high out of reset, low from an accepted enable until the DONE state.
+		if (!rst_n)
+			rdy_reg <= 1'b1;
+		else begin
+			//
+			if (ena && (fsm_state == FSM_STATE_IDLE))
+				/* clear flag: a new operation has been accepted */
+				rdy_reg <= 1'b0;
+			else if (fsm_state == FSM_STATE_DONE)
+				/* set flag: the operation has finished */
+				rdy_reg <= 1'b1;
+			//
+		end
+			
+	
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/curve/rom/brom_p256_delta.v b/rtl/curve/rom/brom_p256_delta.v
new file mode 100644
index 0000000..b9a345a
--- /dev/null
+++ b/rtl/curve/rom/brom_p256_delta.v
@@ -0,0 +1,68 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module brom_p256_delta
+   (
+		input		wire				clk,
+		input		wire	[ 3-1:0]	b_addr,
+		output	wire	[32-1:0]	b_out
+    );
+   // Constant table of eight 32-bit words with a registered (one clock) read.
+   // Read with address 7 as the top word, the value is (q + 1) / 2 for the P-256 prime q.
+   //
+   // Output Registers
+   //
+   reg [31:0] bram_reg_b;
+
+   assign b_out = bram_reg_b;
+
+
+   //
+   // Read-Only Port B
+	//
+	always @(posedge clk)
+		// all eight addresses are listed, so no default branch is needed
+		case (b_addr)
+			3'b000:	bram_reg_b <= 32'h00000000;
+			3'b001:	bram_reg_b <= 32'h00000000;
+			3'b010:	bram_reg_b <= 32'h80000000;
+			3'b011:	bram_reg_b <= 32'h00000000;
+			3'b100:	bram_reg_b <= 32'h00000000;
+			3'b101:	bram_reg_b <= 32'h80000000;
+			3'b110:	bram_reg_b <= 32'h80000000;
+			3'b111:	bram_reg_b <= 32'h7fffffff;
+		endcase
+
+
+endmodule
diff --git a/rtl/curve/rom/brom_p256_g_x.v b/rtl/curve/rom/brom_p256_g_x.v
new file mode 100644
index 0000000..0816ef6
--- /dev/null
+++ b/rtl/curve/rom/brom_p256_g_x.v
@@ -0,0 +1,68 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module brom_p256_g_x
+   (
+		input		wire				clk,
+		input		wire	[ 3-1:0]	b_addr,
+		output	wire	[32-1:0]	b_out
+    );
+   // Constant table of eight 32-bit words with a registered (one clock) read.
+   // Read with address 7 as the top word, the value is the x coordinate of the P-256 base point.
+   //
+   // Output Registers
+   //
+   reg [31:0] bram_reg_b;
+
+   assign b_out = bram_reg_b;
+
+
+   //
+   // Read-Only Port B
+	//
+	always @(posedge clk)
+		// all eight addresses are listed, so no default branch is needed
+		case (b_addr)
+			3'b000:	bram_reg_b <= 32'hd898c296;
+			3'b001:	bram_reg_b <= 32'hf4a13945;
+			3'b010:	bram_reg_b <= 32'h2deb33a0;
+			3'b011:	bram_reg_b <= 32'h77037d81;
+			3'b100:	bram_reg_b <= 32'h63a440f2;
+			3'b101:	bram_reg_b <= 32'hf8bce6e5;
+			3'b110:	bram_reg_b <= 32'he12c4247;
+			3'b111:	bram_reg_b <= 32'h6b17d1f2;
+		endcase
+
+
+endmodule
diff --git a/rtl/curve/rom/brom_p256_g_y.v b/rtl/curve/rom/brom_p256_g_y.v
new file mode 100644
index 0000000..4d9c61e
--- /dev/null
+++ b/rtl/curve/rom/brom_p256_g_y.v
@@ -0,0 +1,68 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module brom_p256_g_y
+   (
+		input		wire				clk,
+		input		wire	[ 3-1:0]	b_addr,
+		output	wire	[32-1:0]	b_out
+    );
+   // Constant table of eight 32-bit words with a registered (one clock) read.
+   // Read with address 7 as the top word, the value is the y coordinate of the P-256 base point.
+   //
+   // Output Registers
+   //
+   reg [31:0] bram_reg_b;
+
+   assign b_out = bram_reg_b;
+
+
+   //
+   // Read-Only Port B
+	//
+	always @(posedge clk)
+		// all eight addresses are listed, so no default branch is needed
+		case (b_addr)
+			3'b000:	bram_reg_b <= 32'h37bf51f5;
+			3'b001:	bram_reg_b <= 32'hcbb64068;
+			3'b010:	bram_reg_b <= 32'h6b315ece;
+			3'b011:	bram_reg_b <= 32'h2bce3357;
+			3'b100:	bram_reg_b <= 32'h7c0f9e16;
+			3'b101:	bram_reg_b <= 32'h8ee7eb4a;
+			3'b110:	bram_reg_b <= 32'hfe1a7f9b;
+			3'b111:	bram_reg_b <= 32'h4fe342e2;
+		endcase
+
+
+endmodule
diff --git a/rtl/curve/rom/brom_p256_h_x.v b/rtl/curve/rom/brom_p256_h_x.v
new file mode 100644
index 0000000..0b69f77
--- /dev/null
+++ b/rtl/curve/rom/brom_p256_h_x.v
@@ -0,0 +1,68 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module brom_p256_h_x
+  (
+   input  wire        clk,     // read clock
+   input  wire [ 2:0] b_addr,  // word select, 0..7
+   output wire [31:0] b_out    // word selected at the last rising clk edge
+   );
+
+   //
+   // Constant ROM holding the 256-bit value H_x as eight 32-bit words.
+   // NOTE(review): presumably least significant word first - confirm.
+   //
+
+   //
+   // Registered read data (one clock of latency from b_addr to b_out)
+   //
+   reg [31:0] rom_word_q;
+
+   always @(posedge clk)
+     begin
+        case (b_addr)
+          3'd0: rom_word_q <= 32'h4ece7ad0;
+          3'd1: rom_word_q <= 32'h16bd8d74;
+          3'd2: rom_word_q <= 32'ha42998be;
+          3'd3: rom_word_q <= 32'h11f904fe;
+          3'd4: rom_word_q <= 32'h38b77e1b;
+          3'd5: rom_word_q <= 32'h0e863235;
+          3'd6: rom_word_q <= 32'h3da77b71;
+          3'd7: rom_word_q <= 32'h29d05c19;
+        endcase
+     end
+
+   assign b_out = rom_word_q;
+
+endmodule
diff --git a/rtl/curve/rom/brom_p256_h_y.v b/rtl/curve/rom/brom_p256_h_y.v
new file mode 100644
index 0000000..362fce6
--- /dev/null
+++ b/rtl/curve/rom/brom_p256_h_y.v
@@ -0,0 +1,68 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module brom_p256_h_y
+  (
+   input  wire        clk,     // read clock
+   input  wire [ 2:0] b_addr,  // word select, 0..7
+   output wire [31:0] b_out    // word selected at the last rising clk edge
+   );
+
+   //
+   // Constant ROM holding the 256-bit value H_y as eight 32-bit words.
+   // NOTE(review): presumably least significant word first - confirm.
+   //
+
+   //
+   // Registered read data (one clock of latency from b_addr to b_out)
+   //
+   reg [31:0] rom_word_q;
+
+   always @(posedge clk)
+     begin
+        case (b_addr)
+          3'd0: rom_word_q <= 32'hc840ae07;
+          3'd1: rom_word_q <= 32'h3449bf97;
+          3'd2: rom_word_q <= 32'h94cea131;
+          3'd3: rom_word_q <= 32'hd431cca9;
+          3'd4: rom_word_q <= 32'h83f061e9;
+          3'd5: rom_word_q <= 32'h711814b5;
+          3'd6: rom_word_q <= 32'h01e58065;
+          3'd7: rom_word_q <= 32'hb01cbd1c;
+        endcase
+     end
+
+   assign b_out = rom_word_q;
+
+endmodule
diff --git a/rtl/curve/rom/brom_p256_one.v b/rtl/curve/rom/brom_p256_one.v
new file mode 100644
index 0000000..4097874
--- /dev/null
+++ b/rtl/curve/rom/brom_p256_one.v
@@ -0,0 +1,68 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module brom_p256_one
+  (
+   input  wire        clk,     // read clock
+   input  wire [ 2:0] b_addr,  // word select, 0..7
+   output wire [31:0] b_out    // word selected at the last rising clk edge
+   );
+
+   //
+   // Constant ROM: eight 32-bit words, word 0 is 1 and words 1..7 are 0.
+   // Every address is listed explicitly, so no default arm is involved.
+   //
+
+   //
+   // Registered read data (one clock of latency from b_addr to b_out)
+   //
+   reg [31:0] rom_word_q;
+
+   always @(posedge clk)
+     begin
+        case (b_addr)
+          3'd0: rom_word_q <= 32'h00000001;
+          3'd1: rom_word_q <= 32'h00000000;
+          3'd2: rom_word_q <= 32'h00000000;
+          3'd3: rom_word_q <= 32'h00000000;
+          3'd4: rom_word_q <= 32'h00000000;
+          3'd5: rom_word_q <= 32'h00000000;
+          3'd6: rom_word_q <= 32'h00000000;
+          3'd7: rom_word_q <= 32'h00000000;
+        endcase
+     end
+
+   assign b_out = rom_word_q;
+
+endmodule
diff --git a/rtl/curve/rom/brom_p256_q.v b/rtl/curve/rom/brom_p256_q.v
new file mode 100644
index 0000000..fe94593
--- /dev/null
+++ b/rtl/curve/rom/brom_p256_q.v
@@ -0,0 +1,68 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module brom_p256_q
+  (
+   input  wire        clk,     // read clock
+   input  wire [ 2:0] b_addr,  // word select, 0..7
+   output wire [31:0] b_out    // word selected at the last rising clk edge
+   );
+
+   //
+   // Constant ROM: the NIST P-256 field prime 2^256 - 2^224 + 2^192 + 2^96 - 1
+   // as eight 32-bit words with the least significant word at address 0.
+   //
+
+   //
+   // Registered read data (one clock of latency from b_addr to b_out)
+   //
+   reg [31:0] rom_word_q;
+
+   always @(posedge clk)
+     begin
+        case (b_addr)
+          3'd0: rom_word_q <= 32'hffffffff;
+          3'd1: rom_word_q <= 32'hffffffff;
+          3'd2: rom_word_q <= 32'hffffffff;
+          3'd3: rom_word_q <= 32'h00000000;
+          3'd4: rom_word_q <= 32'h00000000;
+          3'd5: rom_word_q <= 32'h00000000;
+          3'd6: rom_word_q <= 32'h00000001;
+          3'd7: rom_word_q <= 32'hffffffff;
+        endcase
+     end
+
+   assign b_out = rom_word_q;
+
+endmodule
diff --git a/rtl/curve/rom/brom_p256_zero.v b/rtl/curve/rom/brom_p256_zero.v
new file mode 100644
index 0000000..f6d19a1
--- /dev/null
+++ b/rtl/curve/rom/brom_p256_zero.v
@@ -0,0 +1,70 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module brom_p256_zero
+  (
+   // This constant source has no clk / b_addr ports (both are disabled
+   // in the port list); only the data output exists.
+   output wire [31:0] b_out    // always 32'h00000000
+   );
+
+
+   //
+   // Constant Zero
+   //
+   // Every 32-bit word of the all-zero value is identical, so neither
+   // address decoding nor an output register is used: the output is
+   // tied low combinationally.
+   //
+   assign b_out = 32'h00000000;
+
+
+   //
+   // Reference only: the registered form used by the other brom_p256_*
+   // modules would look like this. It is disabled here; enabling it
+   // would require the clk and b_addr ports and would register b_out.
+   //
+   //    reg [31:0] bram_reg_b;
+   //    assign b_out = bram_reg_b;
+   //
+   //    always @(posedge clk)
+   //      case (b_addr)
+   //        3'b000: bram_reg_b <= 32'h00000000;
+   //        ...     (identical for 3'b001 through 3'b110)
+   //        3'b111: bram_reg_b <= 32'h00000000;
+   //      endcase
+   //
+
+
+endmodule
diff --git a/rtl/curve/uop/uop_add_rom.v b/rtl/curve/uop/uop_add_rom.v
new file mode 100644
index 0000000..c807736
--- /dev/null
+++ b/rtl/curve/uop/uop_add_rom.v
@@ -0,0 +1,66 @@
+`timescale 1ns / 1ps
+
+module uop_add_rom
+  (
+   input  wire          clk,   // read clock
+   input  wire [ 5: 0]  addr,  // microinstruction address
+   output reg  [19: 0]  data   // {opcode[4:0], src[4:0], src[4:0], dst[2:0], exec[1:0]}
+   );
+
+
+   //
+   // Microcode field encodings (OPCODE_*, UOP_SRC_*, UOP_DST_*, UOP_EXEC_*).
+   // Forward slash so that the path also resolves on non-Windows hosts.
+`include "../uop_ecdsa.v"
+
+
+   //
+   // Addition Microprogram (registered lookup, one clock of read latency)
+   //
+   always @(posedge clk)
+     //
+     case (addr)
+       // main sequence, every word carries UOP_EXEC_ALWAYS
+       /*  2. */ 6'd00: data <= {OPCODE_CMP, UOP_SRC_PZ,    UOP_SRC_ZERO,  UOP_DST_DUMMY, UOP_EXEC_ALWAYS};
+       /*  3. */ 6'd01: data <= {OPCODE_MOV, UOP_SRC_PZ,    UOP_SRC_DUMMY, UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*     */ 6'd02: data <= {OPCODE_MUL, UOP_SRC_PZ,    UOP_SRC_T1,    UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*  4. */ 6'd03: data <= {OPCODE_MUL, UOP_SRC_PZ,    UOP_SRC_T1,    UOP_DST_T2,    UOP_EXEC_ALWAYS};
+       /*  5. */ 6'd04: data <= {OPCODE_MUL, UOP_SRC_T1,    UOP_SRC_G_X,   UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*  6. */ 6'd05: data <= {OPCODE_MUL, UOP_SRC_T2,    UOP_SRC_G_Y,   UOP_DST_T2,    UOP_EXEC_ALWAYS};
+       /*  7. */ 6'd06: data <= {OPCODE_SUB, UOP_SRC_T1,    UOP_SRC_PX,    UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*  8. */ 6'd07: data <= {OPCODE_SUB, UOP_SRC_T2,    UOP_SRC_PY,    UOP_DST_T2,    UOP_EXEC_ALWAYS};
+       /*  9. */ 6'd08: data <= {OPCODE_CMP, UOP_SRC_T1,    UOP_SRC_ZERO,  UOP_DST_DUMMY, UOP_EXEC_ALWAYS};
+       /*     */ 6'd09: data <= {OPCODE_CMP, UOP_SRC_T2,    UOP_SRC_ZERO,  UOP_DST_DUMMY, UOP_EXEC_ALWAYS};
+       /* 10. */ 6'd10: data <= {OPCODE_MUL, UOP_SRC_PZ,    UOP_SRC_T1,    UOP_DST_RZ,    UOP_EXEC_ALWAYS};
+       /* 11. */ 6'd11: data <= {OPCODE_MOV, UOP_SRC_T1,    UOP_SRC_DUMMY, UOP_DST_T3,    UOP_EXEC_ALWAYS};
+       /*     */ 6'd12: data <= {OPCODE_MUL, UOP_SRC_T1,    UOP_SRC_T3,    UOP_DST_T3,    UOP_EXEC_ALWAYS};
+       /* 12. */ 6'd13: data <= {OPCODE_MUL, UOP_SRC_T1,    UOP_SRC_T3,    UOP_DST_T4,    UOP_EXEC_ALWAYS};
+       /* 13. */ 6'd14: data <= {OPCODE_MUL, UOP_SRC_PX,    UOP_SRC_T3,    UOP_DST_T3,    UOP_EXEC_ALWAYS};
+       /* 14. */ 6'd15: data <= {OPCODE_ADD, UOP_SRC_T3,    UOP_SRC_T3,    UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /* 15. */ 6'd16: data <= {OPCODE_MOV, UOP_SRC_T2,    UOP_SRC_DUMMY, UOP_DST_RX,    UOP_EXEC_ALWAYS};
+       /*     */ 6'd17: data <= {OPCODE_MUL, UOP_SRC_RX,    UOP_SRC_T2,    UOP_DST_RX,    UOP_EXEC_ALWAYS};
+       /* 16. */ 6'd18: data <= {OPCODE_SUB, UOP_SRC_RX,    UOP_SRC_T1,    UOP_DST_RX,    UOP_EXEC_ALWAYS};
+       /* 17. */ 6'd19: data <= {OPCODE_SUB, UOP_SRC_RX,    UOP_SRC_T4,    UOP_DST_RX,    UOP_EXEC_ALWAYS};
+       /* 18. */ 6'd20: data <= {OPCODE_SUB, UOP_SRC_T3,    UOP_SRC_RX,    UOP_DST_T3,    UOP_EXEC_ALWAYS};
+       /* 19. */ 6'd21: data <= {OPCODE_MUL, UOP_SRC_T2,    UOP_SRC_T3,    UOP_DST_T3,    UOP_EXEC_ALWAYS};
+       /* 20. */ 6'd22: data <= {OPCODE_MUL, UOP_SRC_PY,    UOP_SRC_T4,    UOP_DST_T4,    UOP_EXEC_ALWAYS};
+       /* 21. */ 6'd23: data <= {OPCODE_SUB, UOP_SRC_T3,    UOP_SRC_T4,    UOP_DST_RY,    UOP_EXEC_ALWAYS};
+       // UOP_EXEC_PZT1T2_0XX: load (G_X, G_Y, ONE) into (RX, RY, RZ)
+       6'd24: data <= {OPCODE_MOV, UOP_SRC_G_X,   UOP_SRC_DUMMY, UOP_DST_RX,    UOP_EXEC_PZT1T2_0XX};
+       6'd25: data <= {OPCODE_MOV, UOP_SRC_G_Y,   UOP_SRC_DUMMY, UOP_DST_RY,    UOP_EXEC_PZT1T2_0XX};
+       6'd26: data <= {OPCODE_MOV, UOP_SRC_ONE,   UOP_SRC_DUMMY, UOP_DST_RZ,    UOP_EXEC_PZT1T2_0XX};
+       // UOP_EXEC_PZT1T2_100: load (H_X, H_Y, ONE) into (RX, RY, RZ)
+       6'd27: data <= {OPCODE_MOV, UOP_SRC_H_X,   UOP_SRC_DUMMY, UOP_DST_RX,    UOP_EXEC_PZT1T2_100};
+       6'd28: data <= {OPCODE_MOV, UOP_SRC_H_Y,   UOP_SRC_DUMMY, UOP_DST_RY,    UOP_EXEC_PZT1T2_100};
+       6'd29: data <= {OPCODE_MOV, UOP_SRC_ONE,   UOP_SRC_DUMMY, UOP_DST_RZ,    UOP_EXEC_PZT1T2_100};
+       // UOP_EXEC_PZT1T2_101: load (ONE, ONE, ZERO) into (RX, RY, RZ)
+       6'd30: data <= {OPCODE_MOV, UOP_SRC_ONE,   UOP_SRC_DUMMY, UOP_DST_RX,    UOP_EXEC_PZT1T2_101};
+       6'd31: data <= {OPCODE_MOV, UOP_SRC_ONE,   UOP_SRC_DUMMY, UOP_DST_RY,    UOP_EXEC_PZT1T2_101};
+       6'd32: data <= {OPCODE_MOV, UOP_SRC_ZERO,  UOP_SRC_DUMMY, UOP_DST_RZ,    UOP_EXEC_PZT1T2_101};
+       // all other addresses: OPCODE_RDY, with all five fields so the word is exactly 20 bits
+       default: data <= {OPCODE_RDY, UOP_SRC_DUMMY, UOP_SRC_DUMMY, UOP_DST_DUMMY, UOP_EXEC_ALWAYS};
+
+     endcase
+
+
+endmodule
diff --git a/rtl/curve/uop/uop_conv_rom.v b/rtl/curve/uop/uop_conv_rom.v
new file mode 100644
index 0000000..3097736
--- /dev/null
+++ b/rtl/curve/uop/uop_conv_rom.v
@@ -0,0 +1,38 @@
+`timescale 1ns / 1ps
+
+module uop_conv_rom
+  (
+   input  wire          clk,   // read clock
+   input  wire [ 5: 0]  addr,  // microinstruction address
+   output reg  [19: 0]  data   // {opcode[4:0], src[4:0], src[4:0], dst[2:0], exec[1:0]}
+   );
+
+
+   //
+   // Microcode field encodings (OPCODE_*, UOP_SRC_*, UOP_DST_*, UOP_EXEC_*).
+   // Forward slash so that the path also resolves on non-Windows hosts.
+`include "../uop_ecdsa.v"
+
+
+   // Conversion Microprogram: RX = PX * V^2, RY = PY * V^3, then RX and RY
+   // are overwritten with ZERO by the words tagged UOP_EXEC_PZT1T2_0XX.
+   // NOTE(review): V is presumably the inverse of PZ - confirm in curve_mul_256.
+   always @(posedge clk)
+     //
+     case (addr)
+       // main sequence, every word carries UOP_EXEC_ALWAYS
+       6'd00: data <= {OPCODE_CMP, UOP_SRC_PZ,   UOP_SRC_ZERO,  UOP_DST_DUMMY, UOP_EXEC_ALWAYS};
+       6'd01: data <= {OPCODE_MOV, UOP_SRC_V,    UOP_SRC_DUMMY, UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       6'd02: data <= {OPCODE_MUL, UOP_SRC_V,    UOP_SRC_T1,    UOP_DST_T2,    UOP_EXEC_ALWAYS};
+       6'd03: data <= {OPCODE_MUL, UOP_SRC_V,    UOP_SRC_T2,    UOP_DST_T3,    UOP_EXEC_ALWAYS};
+       6'd04: data <= {OPCODE_MUL, UOP_SRC_PX,   UOP_SRC_T2,    UOP_DST_RX,    UOP_EXEC_ALWAYS};
+       6'd05: data <= {OPCODE_MUL, UOP_SRC_PY,   UOP_SRC_T3,    UOP_DST_RY,    UOP_EXEC_ALWAYS};
+       6'd06: data <= {OPCODE_MOV, UOP_SRC_ZERO, UOP_SRC_DUMMY, UOP_DST_RX,    UOP_EXEC_PZT1T2_0XX};
+       6'd07: data <= {OPCODE_MOV, UOP_SRC_ZERO, UOP_SRC_DUMMY, UOP_DST_RY,    UOP_EXEC_PZT1T2_0XX};
+       // all other addresses: OPCODE_RDY, with all five fields so the word is exactly 20 bits
+       default: data <= {OPCODE_RDY, UOP_SRC_DUMMY, UOP_SRC_DUMMY, UOP_DST_DUMMY, UOP_EXEC_ALWAYS};
+
+     endcase
+
+
+endmodule
diff --git a/rtl/curve/uop/uop_dbl_rom.v b/rtl/curve/uop/uop_dbl_rom.v
new file mode 100644
index 0000000..1939ca9
--- /dev/null
+++ b/rtl/curve/uop/uop_dbl_rom.v
@@ -0,0 +1,58 @@
+`timescale 1ns / 1ps
+
+module uop_dbl_rom
+  (
+   input  wire          clk,   // read clock
+   input  wire [ 5: 0]  addr,  // microinstruction address
+   output reg  [19: 0]  data   // {opcode[4:0], src[4:0], src[4:0], dst[2:0], exec[1:0]}
+   );
+
+
+   //
+   // Microcode field encodings (OPCODE_*, UOP_SRC_*, UOP_DST_*, UOP_EXEC_*).
+   // Forward slash so that the path also resolves on non-Windows hosts.
+`include "../uop_ecdsa.v"
+
+
+   //
+   // Doubling Microprogram (registered lookup, one clock of read latency)
+   //
+   always @(posedge clk)
+     //
+     case (addr)
+       // main sequence, every word carries UOP_EXEC_ALWAYS; all labels are 6 bits wide like addr
+       /*  1. */ 6'd00: data <= {OPCODE_CMP, UOP_SRC_PZ,    UOP_SRC_ZERO,  UOP_DST_DUMMY, UOP_EXEC_ALWAYS};
+       /*  2. */ 6'd01: data <= {OPCODE_MOV, UOP_SRC_PZ,    UOP_SRC_DUMMY, UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*     */ 6'd02: data <= {OPCODE_MUL, UOP_SRC_PZ,    UOP_SRC_T1,    UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*  3. */ 6'd03: data <= {OPCODE_SUB, UOP_SRC_PX,    UOP_SRC_T1,    UOP_DST_T2,    UOP_EXEC_ALWAYS};
+       /*  4. */ 6'd04: data <= {OPCODE_ADD, UOP_SRC_PX,    UOP_SRC_T1,    UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*  5. */ 6'd05: data <= {OPCODE_MUL, UOP_SRC_T1,    UOP_SRC_T2,    UOP_DST_T2,    UOP_EXEC_ALWAYS};
+       /*  6. */ 6'd06: data <= {OPCODE_ADD, UOP_SRC_T2,    UOP_SRC_T2,    UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*     */ 6'd07: data <= {OPCODE_ADD, UOP_SRC_T1,    UOP_SRC_T2,    UOP_DST_T2,    UOP_EXEC_ALWAYS};
+       /*  7. */ 6'd08: data <= {OPCODE_ADD, UOP_SRC_PY,    UOP_SRC_PY,    UOP_DST_RY,    UOP_EXEC_ALWAYS};
+       /*  8. */ 6'd09: data <= {OPCODE_MUL, UOP_SRC_PZ,    UOP_SRC_RY,    UOP_DST_RZ,    UOP_EXEC_ALWAYS};
+       /*  9. */ 6'd10: data <= {OPCODE_MOV, UOP_SRC_RY,    UOP_SRC_DUMMY, UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*     */ 6'd11: data <= {OPCODE_MOV, UOP_SRC_RY,    UOP_SRC_DUMMY, UOP_DST_T3,    UOP_EXEC_ALWAYS};
+       /*     */ 6'd12: data <= {OPCODE_MUL, UOP_SRC_T1,    UOP_SRC_T3,    UOP_DST_RY,    UOP_EXEC_ALWAYS};
+       /* 10. */ 6'd13: data <= {OPCODE_MUL, UOP_SRC_PX,    UOP_SRC_RY,    UOP_DST_T3,    UOP_EXEC_ALWAYS};
+       /* 11. */ 6'd14: data <= {OPCODE_MOV, UOP_SRC_RY,    UOP_SRC_DUMMY, UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*     */ 6'd15: data <= {OPCODE_MUL, UOP_SRC_RY,    UOP_SRC_T1,    UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /* 12. */ 6'd16: data <= {OPCODE_MUL, UOP_SRC_T1,    UOP_SRC_DELTA, UOP_DST_RY,    UOP_EXEC_ALWAYS};
+       /* 13. */ 6'd17: data <= {OPCODE_MOV, UOP_SRC_T2,    UOP_SRC_DUMMY, UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /*     */ 6'd18: data <= {OPCODE_MUL, UOP_SRC_T1,    UOP_SRC_T2,    UOP_DST_RX,    UOP_EXEC_ALWAYS};
+       /* 14. */ 6'd19: data <= {OPCODE_ADD, UOP_SRC_T3,    UOP_SRC_T3,    UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /* 15. */ 6'd20: data <= {OPCODE_SUB, UOP_SRC_RX,    UOP_SRC_T1,    UOP_DST_RX,    UOP_EXEC_ALWAYS};
+       /* 16. */ 6'd21: data <= {OPCODE_SUB, UOP_SRC_T3,    UOP_SRC_RX,    UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /* 17. */ 6'd22: data <= {OPCODE_MUL, UOP_SRC_T1,    UOP_SRC_T2,    UOP_DST_T1,    UOP_EXEC_ALWAYS};
+       /* 18. */ 6'd23: data <= {OPCODE_SUB, UOP_SRC_T1,    UOP_SRC_RY,    UOP_DST_RY,    UOP_EXEC_ALWAYS};
+       // UOP_EXEC_PZT1T2_0XX: load (ONE, ONE, ZERO) into (RX, RY, RZ)
+       6'd24: data <= {OPCODE_MOV, UOP_SRC_ONE,   UOP_SRC_DUMMY, UOP_DST_RX,    UOP_EXEC_PZT1T2_0XX};
+       6'd25: data <= {OPCODE_MOV, UOP_SRC_ONE,   UOP_SRC_DUMMY, UOP_DST_RY,    UOP_EXEC_PZT1T2_0XX};
+       6'd26: data <= {OPCODE_MOV, UOP_SRC_ZERO,  UOP_SRC_DUMMY, UOP_DST_RZ,    UOP_EXEC_PZT1T2_0XX};
+       // all other addresses: OPCODE_RDY, with all five fields so the word is exactly 20 bits
+       default: data <= {OPCODE_RDY, UOP_SRC_DUMMY, UOP_SRC_DUMMY, UOP_DST_DUMMY, UOP_EXEC_ALWAYS};
+
+     endcase
+
+
+endmodule
diff --git a/rtl/curve/uop/uop_init_rom.v b/rtl/curve/uop/uop_init_rom.v
new file mode 100644
index 0000000..ac44b55
--- /dev/null
+++ b/rtl/curve/uop/uop_init_rom.v
@@ -0,0 +1,33 @@
+`timescale 1ns / 1ps
+
+module uop_init_rom
+  (
+   input  wire          clk,   // read clock
+   input  wire [ 5: 0]  addr,  // microinstruction address
+   output reg  [19: 0]  data   // {opcode[4:0], src[4:0], src[4:0], dst[2:0], exec[1:0]}
+   );
+
+
+   //
+   // Microcode field encodings (OPCODE_*, UOP_SRC_*, UOP_DST_*, UOP_EXEC_*).
+   // Forward slash so that the path also resolves on non-Windows hosts.
+`include "../uop_ecdsa.v"
+
+
+   //
+   // Initialization Microprogram: loads (ONE, ONE, ZERO) into (RX, RY, RZ)
+   //
+   always @(posedge clk)
+     //
+     case (addr)
+       // three unconditional moves
+       6'd00: data <= {OPCODE_MOV, UOP_SRC_ONE,  UOP_SRC_DUMMY, UOP_DST_RX, UOP_EXEC_ALWAYS};
+       6'd01: data <= {OPCODE_MOV, UOP_SRC_ONE,  UOP_SRC_DUMMY, UOP_DST_RY, UOP_EXEC_ALWAYS};
+       6'd02: data <= {OPCODE_MOV, UOP_SRC_ZERO, UOP_SRC_DUMMY, UOP_DST_RZ, UOP_EXEC_ALWAYS};
+       // all other addresses: OPCODE_RDY, with all five fields so the word is exactly 20 bits
+       default: data <= {OPCODE_RDY, UOP_SRC_DUMMY, UOP_SRC_DUMMY, UOP_DST_DUMMY, UOP_EXEC_ALWAYS};
+
+     endcase
+
+
+endmodule
diff --git a/rtl/curve/uop_ecdsa.v b/rtl/curve/uop_ecdsa.v
new file mode 100644
index 0000000..e64119d
--- /dev/null
+++ b/rtl/curve/uop_ecdsa.v
@@ -0,0 +1,50 @@
+// Opcode field (5 bits): one bit set per operation, all-zero is RDY.
+localparam [4:0] OPCODE_RDY          = 5'h00;
+localparam [4:0] OPCODE_CMP          = 5'h01;
+localparam [4:0] OPCODE_MOV          = 5'h02;
+localparam [4:0] OPCODE_ADD          = 5'h04;
+localparam [4:0] OPCODE_SUB          = 5'h08;
+localparam [4:0] OPCODE_MUL          = 5'h10;
+// Source operand field (5 bits): operands PX / PY / PZ.
+localparam [4:0] UOP_SRC_PX          = 5'd0;
+localparam [4:0] UOP_SRC_PY          = 5'd1;
+localparam [4:0] UOP_SRC_PZ          = 5'd2;
+// Source operands RX / RY / RZ (the same names exist as destinations).
+localparam [4:0] UOP_SRC_RX          = 5'd3;
+localparam [4:0] UOP_SRC_RY          = 5'd4;
+localparam [4:0] UOP_SRC_RZ          = 5'd5;
+// Source operands T1..T4 (the same names exist as destinations).
+localparam [4:0] UOP_SRC_T1          = 5'd6;
+localparam [4:0] UOP_SRC_T2          = 5'd7;
+localparam [4:0] UOP_SRC_T3          = 5'd8;
+localparam [4:0] UOP_SRC_T4          = 5'd9;
+// Constant sources.
+localparam [4:0] UOP_SRC_ONE         = 5'd10;
+localparam [4:0] UOP_SRC_ZERO        = 5'd11;
+localparam [4:0] UOP_SRC_DELTA       = 5'd12;
+// Source V (codes 13 and 14 are not assigned in this file).
+localparam [4:0] UOP_SRC_V           = 5'd15;
+// Sources G_X / G_Y and H_X / H_Y.
+localparam [4:0] UOP_SRC_G_X         = 5'd16;
+localparam [4:0] UOP_SRC_G_Y         = 5'd17;
+localparam [4:0] UOP_SRC_H_X         = 5'd18;
+localparam [4:0] UOP_SRC_H_Y         = 5'd19;
+// Placeholder for an unused source slot: all bits are x.
+localparam [4:0] UOP_SRC_DUMMY       = 5'bxxxxx;
+// Destination field (3 bits): RX / RY / RZ.
+localparam [2:0] UOP_DST_RX          = 3'd0;
+localparam [2:0] UOP_DST_RY          = 3'd1;
+localparam [2:0] UOP_DST_RZ          = 3'd2;
+// Destinations T1..T4.
+localparam [2:0] UOP_DST_T1          = 3'd3;
+localparam [2:0] UOP_DST_T2          = 3'd4;
+localparam [2:0] UOP_DST_T3          = 3'd5;
+localparam [2:0] UOP_DST_T4          = 3'd6;
+// Placeholder for an unused destination slot: all bits are x.
+localparam [2:0] UOP_DST_DUMMY       = 3'bxxx;
+// Execution condition field (2 bits); the R/G/H/O tags are the original author's.
+localparam [1:0] UOP_EXEC_ALWAYS     = 2'd3;	// R
+localparam [1:0] UOP_EXEC_PZT1T2_0XX = 2'd2;	// G
+localparam [1:0] UOP_EXEC_PZT1T2_100 = 2'd0;	// H
+localparam [1:0] UOP_EXEC_PZT1T2_101 = 2'd1;	// O
+
diff --git a/rtl/ecdsa256.v b/rtl/ecdsa256.v
new file mode 100644
index 0000000..86e22e5
--- /dev/null
+++ b/rtl/ecdsa256.v
@@ -0,0 +1,160 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module ecdsa256
+  (
+   input  wire        clk,
+   input  wire        rst_n,        // forwarded to curve_mul_256
+
+   input  wire        next,         // a rising edge produces a one-clock start pulse
+   output wire        valid,        // rdy output of curve_mul_256
+
+   input  wire        bus_cs,
+   input  wire        bus_we,
+   input  wire [ 4:0] bus_addr,     // {bank[1:0], word[2:0]}
+   input  wire [31:0] bus_data_wr,
+   output wire [31:0] bus_data_rd
+   );
+
+
+   //
+   // Memory Banks (selected by the two upper bus address bits)
+   //
+   localparam [1:0] BUS_ADDR_BANK_K = 2'b00;
+   localparam [1:0] BUS_ADDR_BANK_X = 2'b01;
+   localparam [1:0] BUS_ADDR_BANK_Y = 2'b10;
+
+   wire [1:0] bus_addr_upper = bus_addr[4:3];
+   wire [2:0] bus_addr_lower = bus_addr[2:0];
+
+
+   //
+   // Memories
+   //
+
+   wire [31:0] user_rw_k_bram_out;
+   wire [31:0] user_ro_x_bram_out;
+   wire [31:0] user_ro_y_bram_out;
+
+   wire [ 2:0] core_ro_k_bram_addr;
+   wire [ 2:0] core_rw_x_bram_addr;
+   wire [ 2:0] core_rw_y_bram_addr;
+
+   wire        core_rw_x_bram_wren;
+   wire        core_rw_y_bram_wren;
+
+   wire [31:0] core_ro_k_bram_dout;
+   wire [31:0] core_rw_x_bram_din;
+   wire [31:0] core_rw_y_bram_din;
+
+   // K bank: the bus reads and writes through port A, the core reads through port B
+   bram_1rw_1ro_readfirst #
+     (  .MEM_WIDTH(32), .MEM_ADDR_BITS(3)
+     )
+   bram_k
+     (  .clk(clk),
+        .a_addr(bus_addr_lower), .a_out(user_rw_k_bram_out), .a_wr(bus_cs && bus_we && (bus_addr_upper == BUS_ADDR_BANK_K)), .a_in(bus_data_wr),
+        .b_addr(core_ro_k_bram_addr), .b_out(core_ro_k_bram_dout)
+     );
+   // X bank: the core writes through port A, the bus reads through port B
+   bram_1rw_1ro_readfirst #
+     (  .MEM_WIDTH(32), .MEM_ADDR_BITS(3)
+     )
+   bram_x
+     (  .clk(clk),
+        .a_addr(core_rw_x_bram_addr), .a_out(), .a_wr(core_rw_x_bram_wren), .a_in(core_rw_x_bram_din),
+        .b_addr(bus_addr_lower),      .b_out(user_ro_x_bram_out)
+     );
+   // Y bank: the core writes through port A, the bus reads through port B
+   bram_1rw_1ro_readfirst #
+     (  .MEM_WIDTH(32), .MEM_ADDR_BITS(3)
+     )
+   bram_y
+     (  .clk(clk),
+        .a_addr(core_rw_y_bram_addr), .a_out(), .a_wr(core_rw_y_bram_wren), .a_in(core_rw_y_bram_din),
+        .b_addr(bus_addr_lower),      .b_out(user_ro_y_bram_out)
+     );
+
+
+   //
+   // Base Point Multiplier (enabled for one clock on each rising edge of next)
+   //
+   reg  next_dly;
+
+   always @(posedge clk) next_dly <= next;
+
+   wire next_trig = next && !next_dly;
+
+   curve_mul_256 base_point_multiplier_p256
+     (
+      .clk      (clk),
+      .rst_n    (rst_n),
+
+      .ena      (next_trig),
+      .rdy      (valid),
+
+      .k_addr   (core_ro_k_bram_addr),
+      .rx_addr  (core_rw_x_bram_addr),
+      .ry_addr  (core_rw_y_bram_addr),
+
+      .rx_wren  (core_rw_x_bram_wren),
+      .ry_wren  (core_rw_y_bram_wren),
+
+      .k_din    (core_ro_k_bram_dout),
+      .rx_dout  (core_rw_x_bram_din),
+      .ry_dout  (core_rw_y_bram_din)
+      );
+
+   //
+   // Output Selector: the bank bits are registered for one clock and the
+   // delayed copy steers the read mux. Non-blocking, like next_dly above.
+   reg [1:0] bus_addr_upper_prev;
+   always @(posedge clk) bus_addr_upper_prev <= bus_addr_upper;
+   // NOTE(review): the delay presumably matches the BRAM read latency - confirm.
+   reg [31: 0] bus_data_rd_mux;
+   assign bus_data_rd = bus_data_rd_mux;
+
+   always @(*)
+     //
+     case (bus_addr_upper_prev)
+       //
+       BUS_ADDR_BANK_K: bus_data_rd_mux = user_rw_k_bram_out;
+       BUS_ADDR_BANK_X: bus_data_rd_mux = user_ro_x_bram_out;
+       BUS_ADDR_BANK_Y: bus_data_rd_mux = user_ro_y_bram_out;
+       //
+       default:         bus_data_rd_mux = {32{1'b0}};
+       //
+     endcase
+
+endmodule
diff --git a/rtl/ecdsa256_wrapper.v b/rtl/ecdsa256_wrapper.v
new file mode 100644
index 0000000..74f2cbe
--- /dev/null
+++ b/rtl/ecdsa256_wrapper.v
@@ -0,0 +1,177 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+module ecdsa256_wrapper
+  (
+   input wire          clk,        // clock; every register here is posedge-triggered
+   input wire          rst_n,      // synchronous reset, active-low
+
+   input wire          cs,         // chip select
+   input wire          we,         // write enable (1 = write, 0 = read)
+
+   input wire [5: 0]   address,    // [5] = bank select, [4:0] = word offset
+   input wire [31: 0]  write_data,
+   output wire [31: 0] read_data
+   );
+
+
+   //
+   // Address Decoder: address[5] selects wrapper registers (0) or core memory (1)
+   //
+   localparam ADDR_MSB_REGS = 1'b0;
+   localparam ADDR_MSB_CORE = 1'b1;
+
+   wire [0:0] addr_msb = address[5];
+   wire [4:0] addr_lsb = address[4:0];
+
+
+   //
+   // Output Mux
+   //
+   wire [31: 0] read_data_regs;                        // declared but not used in this module
+   wire [31: 0] read_data_core;                        // read data returned by the core
+
+
+   //
+   // Register Map (offsets within the register bank)
+   //
+   localparam ADDR_NAME0        = 5'h00;
+   localparam ADDR_NAME1        = 5'h01;
+   localparam ADDR_VERSION      = 5'h02;
+
+   localparam ADDR_CONTROL      = 5'h08;               // {next, init}
+   localparam ADDR_STATUS       = 5'h09;               // {valid, ready}
+   localparam ADDR_DUMMY        = 5'h0F;               // don't care
+
+// localparam CONTROL_INIT_BIT  = 0; -- not used
+   localparam CONTROL_NEXT_BIT  = 1;
+
+   localparam STATUS_READY_BIT  = 0;                   // hardcoded to always read 1
+   localparam STATUS_VALID_BIT  = 1;                   // follows the core's 'valid' output
+
+   localparam CORE_NAME0        = 32'h65636473; // "ecds"
+   localparam CORE_NAME1        = 32'h61323536; // "a256"
+   localparam CORE_VERSION      = 32'h302E3130; // "0.10"
+
+
+   //
+   // Registers
+   //
+   reg        reg_control;                             // drives the core's 'next' input
+   reg [31:0] reg_dummy;                               // scratch register, read back unchanged
+
+
+   //
+   // Wires
+   //
+   wire reg_status;                                    // the core's 'valid' output
+
+
+   //
+   // ECDSA256 core: selected only when address[5] == 1
+   //
+   ecdsa256 ecdsa256_inst
+   (
+      .clk                      (clk),
+      .rst_n                    (rst_n),
+
+      .next                     (reg_control),
+      .valid                    (reg_status),
+
+      .bus_cs                   (cs && (addr_msb == ADDR_MSB_CORE)),
+      .bus_we                   (we),
+      .bus_addr                 (addr_lsb),
+      .bus_data_wr              (write_data),
+      .bus_data_rd              (read_data_core)
+   );
+
+
+   //
+   // Read Latch: registered read data of the register bank
+   //
+   reg [31: 0]         tmp_read_data;
+
+
+   //
+   // Read/Write Interface
+   //
+   always @(posedge clk)
+     // handles register-bank accesses only (address[5] == 0)
+     if (!rst_n) begin
+        // only reg_control is reset; reg_dummy and tmp_read_data keep their values
+        reg_control <= 1'b0;
+        //
+     end else if (cs && (addr_msb == ADDR_MSB_REGS)) begin
+        //
+        if (we) begin
+           //
+           // Write Handler: writes to any other offset are ignored
+           //
+           case (addr_lsb)
+             //
+             ADDR_CONTROL: reg_control <= write_data[CONTROL_NEXT_BIT];
+             ADDR_DUMMY:   reg_dummy   <= write_data[31:0];
+             //
+           endcase
+           //
+        end else begin
+           //
+           // Read Handler: decodes addr_lsb, the same 5-bit offset the write handler uses
+           //
+           case (addr_lsb)
+             //
+             ADDR_NAME0:        tmp_read_data <= CORE_NAME0;
+             ADDR_NAME1:        tmp_read_data <= CORE_NAME1;
+             ADDR_VERSION:      tmp_read_data <= CORE_VERSION;
+             ADDR_CONTROL:      tmp_read_data <= {{30{1'b0}}, reg_control, 1'b0};
+             ADDR_STATUS:       tmp_read_data <= {{30{1'b0}}, reg_status,  1'b1};
+             ADDR_DUMMY:        tmp_read_data <= reg_dummy;
+             // unmapped offsets read as zero
+             default:           tmp_read_data <= 32'h00000000;
+             //
+           endcase
+           //
+        end
+        //
+     end
+
+
+   //
+   // Register / Core Memory Selector: follows the bank addressed on the previous clock
+   //
+   reg addr_msb_last;
+   always @(posedge clk) addr_msb_last <= addr_msb;    // clocked register: non-blocking
+
+   assign read_data = (addr_msb_last == ADDR_MSB_REGS) ? tmp_read_data : read_data_core;
+
+
+endmodule
diff --git a/rtl/lowlevel/adder32_wrapper.v b/rtl/lowlevel/adder32_wrapper.v
new file mode 100644
index 0000000..ebfd8ce
--- /dev/null
+++ b/rtl/lowlevel/adder32_wrapper.v
@@ -0,0 +1,73 @@
+//------------------------------------------------------------------------------
+//
+// adder32_wrapper.v
+// -----------------------------------------------------------------------------
+// Wrapper for 32-bit adder.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module adder32_wrapper
+  (
+    input  wire          clk,    // clock
+    input  wire [31: 0]  a,      // first addend
+    input  wire [31: 0]  b,      // second addend
+    output wire [31: 0]  s,      // sum
+    input  wire          c_in,   // carry in
+    output wire          c_out   // carry out
+  );
+
+    //
+    // Pull in the ADDER32_PRIMITIVE macro (names the module instantiated below)
+    //
+`include "ecdsa_lowlevel_settings.v"
+
+
+    //
+    // The selected primitive receives every wrapper port, name for name
+    //
+  `ADDER32_PRIMITIVE adder32_inst
+  (
+    .clk   (clk),
+    .c_in  (c_in),
+    .a     (a),
+    .b     (b),
+    .c_out (c_out),
+    .s     (s)
+  );
+
+
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/lowlevel/adder47_wrapper.v b/rtl/lowlevel/adder47_wrapper.v
new file mode 100644
index 0000000..1a0a18e
--- /dev/null
+++ b/rtl/lowlevel/adder47_wrapper.v
@@ -0,0 +1,69 @@
+//------------------------------------------------------------------------------
+//
+// adder47_wrapper.v
+// -----------------------------------------------------------------------------
+// Wrapper for 47-bit adder.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module adder47_wrapper
+  (
+    input  wire          clk,    // clock
+    input  wire [46: 0]  a,      // first addend
+    input  wire [46: 0]  b,      // second addend
+    output wire [46: 0]  s       // sum
+  );
+
+    //
+    // Pull in the ADDER47_PRIMITIVE macro (names the module instantiated below)
+    //
+`include "ecdsa_lowlevel_settings.v"
+
+
+    //
+    // The selected primitive receives every wrapper port, name for name
+    //
+  `ADDER47_PRIMITIVE adder47_inst
+  (
+    .clk (clk),
+    .s   (s),
+    .a   (a),
+    .b   (b)
+  );
+
+
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/lowlevel/artix7/adder32_artix7.v b/rtl/lowlevel/artix7/adder32_artix7.v
new file mode 100644
index 0000000..5f9ba79
--- /dev/null
+++ b/rtl/lowlevel/artix7/adder32_artix7.v
@@ -0,0 +1,96 @@
+//------------------------------------------------------------------------------
+//
+// adder32_artix7.v
+// -----------------------------------------------------------------------------
+// Hardware (Artix-7 DSP48E1) 32-bit adder.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module adder32_artix7
+  (
+    input  wire          clk,    // clock
+    input  wire [31: 0]  a,      // first addend
+    input  wire [31: 0]  b,      // second addend
+    output wire [31: 0]  s,      // sum, bits 31..0 of the DSP result
+    input  wire          c_in,   // carry input
+    output wire          c_out   // carry output, bit 32 of the DSP result
+  );
+
+    //
+    // Split b across the wrapper's 'a' (upper 14 bits) and 'b' (lower 18 bits) inputs
+    //
+  wire [13: 0] b_hi = b[31:18];
+  wire [17: 0] b_lo = b[17: 0];
+
+
+    //
+    // DSP48E1 slice configuration
+    //
+
+    /* Mode words; per Xilinx UG479 this pair selects P = C + A:B + CARRYIN */
+  wire [ 6: 0] mode_op  = 7'b0110011;
+  wire [ 3: 0] mode_alu = 4'b0000;
+
+    /* Raw 48-bit result */
+  wire [47: 0] dsp_p;
+
+  dsp48e1_wrapper dsp_adder
+  (
+    .clk     (clk),
+    .ce      (1'b1),
+
+    .opmode  (mode_op),
+    .alumode (mode_alu),
+
+    .carry   (c_in),
+
+    // operands, zero-extended: 'a' travels on c, 'b' on the a/b pair
+    .c       ({16'd0, a}),
+    .a       ({16'd0, b_hi}),
+    .b       (b_lo),
+
+    .p       (dsp_p)
+  );
+
+    //
+    // Output mapping: low 32 bits are the sum, bit 32 is the carry out
+    //
+  assign {c_out, s} = dsp_p[32: 0];
+
+
+
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/lowlevel/artix7/adder47_artix7.v b/rtl/lowlevel/artix7/adder47_artix7.v
new file mode 100644
index 0000000..00566e4
--- /dev/null
+++ b/rtl/lowlevel/artix7/adder47_artix7.v
@@ -0,0 +1,91 @@
+//------------------------------------------------------------------------------
+//
+// adder47_artix7.v
+// -----------------------------------------------------------------------------
+// Hardware (Artix-7 DSP48E1) 47-bit adder.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module adder47_artix7
+  (
+    input  wire          clk,    // clock
+    input  wire [46: 0]  a,      // first addend
+    input  wire [46: 0]  b,      // second addend
+    output wire [46: 0]  s       // sum, bits 46..0 of the DSP result
+  );
+
+    //
+    // Split b across the wrapper's 'a' (upper 29 bits) and 'b' (lower 18 bits) inputs
+    //
+  wire [28: 0] b_hi = b[46:18];
+  wire [17: 0] b_lo = b[17: 0];
+
+    //
+    // DSP48E1 slice configuration
+    //
+
+    /* Mode words; per Xilinx UG479 this pair selects P = C + A:B + CARRYIN */
+  wire [ 6: 0] mode_op  = 7'b0110011;
+  wire [ 3: 0] mode_alu = 4'b0000;
+
+    /* Raw 48-bit result */
+  wire [47: 0] dsp_p;
+
+  dsp48e1_wrapper dsp_adder
+  (
+    .clk     (clk),
+    .ce      (1'b1),
+
+    .opmode  (mode_op),
+    .alumode (mode_alu),
+
+    .carry   (1'b0),
+
+    // operands, zero-extended by one bit: 'a' travels on c, 'b' on the a/b pair
+    .c       ({1'b0, a}),
+    .a       ({1'b0, b_hi}),
+    .b       (b_lo),
+
+    .p       (dsp_p)
+  );
+
+    //
+    // Output mapping: the top bit of the 48-bit result is dropped
+    //
+  assign s = dsp_p[46: 0];
+
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/lowlevel/artix7/dsp48e1_wrapper.v b/rtl/lowlevel/artix7/dsp48e1_wrapper.v
new file mode 100644
index 0000000..9f29ac1
--- /dev/null
+++ b/rtl/lowlevel/artix7/dsp48e1_wrapper.v
@@ -0,0 +1,159 @@
+//------------------------------------------------------------------------------
+//
+// dsp48e1_wrapper.v
+// -----------------------------------------------------------------------------
+// Hardware (Artix-7 DSP48E1) tile wrapper.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module dsp48e1_wrapper
+	(
+		input					clk,		// clock, wired to CLK
+		
+		input					ce,		// clock enable of the P register (wired to CEP)
+		
+		input		[ 6: 0]	opmode,	// wired to OPMODE
+		input		[ 3: 0]	alumode,	// wired to ALUMODE
+		
+		input					carry,	// wired to CARRYIN
+		
+		input		[29: 0]	a,			// wired to A
+		input		[17: 0]	b,			// wired to B
+		input		[47: 0]	c,			// wired to C
+		
+		output	[47: 0]	p			// wired to P
+	);
+	
+	
+		//
+		// Tile instantiation: PREG and DREG are 1, every other *REG parameter is 0
+		//
+	DSP48E1 #
+	(
+		.AREG						(0),
+		.BREG						(0),
+		.CREG						(0),
+		.DREG						(1),
+		.MREG						(0),
+		.PREG						(1),
+		.ADREG					(0),
+		
+		.ACASCREG				(0),
+		.BCASCREG				(0),
+		.ALUMODEREG				(0),
+		.INMODEREG				(0),
+		.OPMODEREG				(0),
+		.CARRYINREG				(0),
+		.CARRYINSELREG			(0),
+
+		.A_INPUT					("DIRECT"),
+		.B_INPUT					("DIRECT"),
+		
+		.USE_DPORT				("FALSE"),
+		.USE_MULT				("DYNAMIC"),
+		.USE_SIMD				("ONE48"),
+
+		.USE_PATTERN_DETECT	("NO_PATDET"),
+		.SEL_PATTERN			("PATTERN"),
+		.SEL_MASK				("MASK"),
+		.PATTERN					(48'h000000000000),
+		.MASK						(48'h3fffffffffff),
+		.AUTORESET_PATDET		("NO_RESET")
+	)
+	DSP48E1_inst
+	(
+		.CLK					(clk),
+		// every reset input is tied low
+		.RSTA					(1'b0),
+		.RSTB					(1'b0),
+		.RSTC					(1'b0),
+		.RSTD					(1'b0),
+		.RSTM					(1'b0),
+		.RSTP					(1'b0),
+
+		.RSTCTRL				(1'b0),
+		.RSTINMODE			(1'b0),
+		.RSTALUMODE			(1'b0),
+		.RSTALLCARRYIN		(1'b0),
+		// only CEP follows 'ce'; every other clock enable is tied low
+		.CEA1					(1'b0),
+		.CEA2					(1'b0),
+		.CEB1					(1'b0),
+		.CEB2					(1'b0),
+		.CEC					(1'b0),
+		.CED					(1'b0),
+		.CEM					(1'b0),
+		.CEP					(ce),
+		.CEAD					(1'b0),
+		.CEALUMODE			(1'b0),
+		.CEINMODE			(1'b0),
+
+		.CECTRL				(1'b0),
+		.CECARRYIN			(1'b0),
+		// data ports; D is tied to all ones (USE_DPORT is "FALSE", CED is low)
+		.A						(a),
+		.B						(b),
+		.C						(c),
+		.D						({25{1'b1}}),
+		.P						(p),
+
+		.CARRYIN				(carry),
+		.CARRYOUT			(),
+		.CARRYINSEL			(3'b000),
+
+		.CARRYCASCIN		(1'b0),
+		.CARRYCASCOUT		(),
+
+		.PATTERNDETECT		(),
+		.PATTERNBDETECT	(),
+		// OPMODE and ALUMODE come straight from the wrapper ports; INMODE is fixed at zero
+		.OPMODE				(opmode),
+		.ALUMODE				(alumode),
+		.INMODE				(5'b00000),
+
+		.MULTSIGNIN			(1'b0),
+		.MULTSIGNOUT		(),
+
+		.UNDERFLOW			(),
+		.OVERFLOW			(),
+		// cascade inputs are tied to zero, cascade outputs are left unconnected
+		.ACIN					(30'd0),
+		.BCIN					(18'd0),
+		.PCIN					(48'd0),
+
+		.ACOUT				(),
+		.BCOUT				(),
+		.PCOUT				()
+  );
+
+endmodule
diff --git a/rtl/lowlevel/artix7/mac16_artix7.v b/rtl/lowlevel/artix7/mac16_artix7.v
new file mode 100644
index 0000000..09a2413
--- /dev/null
+++ b/rtl/lowlevel/artix7/mac16_artix7.v
@@ -0,0 +1,90 @@
+//------------------------------------------------------------------------------
+//
+// mac16_artix7.v
+// -----------------------------------------------------------------------------
+// Hardware (Artix-7 DSP48E1) 16-bit multiplier and 48-bit accumulator.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module mac16_artix7
+  (
+    input  wire          clk,    // clock
+    input  wire          clr,    // restart the accumulation (active-high)
+    input  wire          ce,     // clock enable (active-high)
+    input  wire [15: 0]  a,      // first factor
+    input  wire [15: 0]  b,      // second factor
+    output wire [46: 0]  s       // accumulated sum, bits 46..0 of the DSP result
+  );
+
+
+    //
+    // DSP48E1 slice configuration
+    //
+
+    /* Mode words; per Xilinx UG479, clr switches the Z operand from P (0) to C (1) */
+  wire [ 6: 0] mode_op  = {2'b01, clr, 4'b0101};
+  wire [ 3: 0] mode_alu = 4'b0000;
+
+    /* Raw 48-bit result */
+  wire [47: 0] dsp_p;
+
+  dsp48e1_wrapper dsp_adder
+  (
+    .clk     (clk),
+    .ce      (ce),
+
+    .opmode  (mode_op),
+    .alumode (mode_alu),
+
+    .carry   (1'b0),
+
+    // factors are zero-extended to the port widths; c is held at zero
+    .a       ({14'd0, a}),
+    .b       ({2'd0, b}),
+    .c       (48'd0),
+
+    .p       (dsp_p)
+  );
+
+    //
+    // Output mapping: the top bit of the 48-bit result is dropped
+    //
+  assign s = dsp_p[46: 0];
+
+
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/lowlevel/artix7/subtractor32_artix7.v b/rtl/lowlevel/artix7/subtractor32_artix7.v
new file mode 100644
index 0000000..b46ac5c
--- /dev/null
+++ b/rtl/lowlevel/artix7/subtractor32_artix7.v
@@ -0,0 +1,94 @@
+//------------------------------------------------------------------------------
+//
+// subtractor32_artix7.v
+// -----------------------------------------------------------------------------
+// Hardware (Artix-7 DSP48E1) 32-bit subtractor.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module subtractor32_artix7
+  (
+    input  wire          clk,     // clock
+    input  wire [31: 0]  a,       // minuend
+    input  wire [31: 0]  b,       // subtrahend
+    output wire [31: 0]  d,       // difference, bits 31..0 of the DSP result
+    input  wire          b_in,    // borrow input
+    output wire          b_out    // borrow output, bit 32 of the DSP result
+  );
+
+    //
+    // Split b across the wrapper's 'a' (upper 14 bits) and 'b' (lower 18 bits) inputs
+    //
+  wire [13: 0] b_hi = b[31:18];
+  wire [17: 0] b_lo = b[17: 0];
+
+    //
+    // DSP48E1 slice configuration
+    //
+
+    /* Mode words; per Xilinx UG479 this pair selects P = C - (A:B + CARRYIN) */
+  wire [ 6: 0] mode_op  = 7'b0110011;
+  wire [ 3: 0] mode_alu = 4'b0011;
+
+    /* Raw 48-bit result */
+  wire [47: 0] dsp_p;
+
+  dsp48e1_wrapper dsp_subtractor
+  (
+    .clk     (clk),
+    .ce      (1'b1),
+
+    .opmode  (mode_op),
+    .alumode (mode_alu),
+
+    .carry   (b_in),
+
+    // operands, zero-extended: 'a' travels on c, 'b' on the a/b pair
+    .c       ({16'd0, a}),
+    .a       ({16'd0, b_hi}),
+    .b       (b_lo),
+
+    .p       (dsp_p)
+  );
+
+    //
+    // Output mapping: low 32 bits are the difference, bit 32 is the borrow out
+    //
+  assign {b_out, d} = dsp_p[32: 0];
+
+
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/lowlevel/ecdsa_lowlevel_settings.v b/rtl/lowlevel/ecdsa_lowlevel_settings.v
new file mode 100644
index 0000000..8f95e2f
--- /dev/null
+++ b/rtl/lowlevel/ecdsa_lowlevel_settings.v
@@ -0,0 +1,17 @@
+`define USE_VENDOR_PRIMITIVES
+// Remove the define above to select the *_generic module names in the else branch.
+`ifdef USE_VENDOR_PRIMITIVES
+// Module names the wrappers instantiate when vendor (Artix-7) primitives are selected
+`define MAC16_PRIMITIVE				mac16_artix7
+`define ADDER32_PRIMITIVE			adder32_artix7
+`define ADDER47_PRIMITIVE			adder47_artix7
+`define SUBTRACTOR32_PRIMITIVE	subtractor32_artix7
+
+`else
+// Module names the wrappers instantiate otherwise (presumably defined elsewhere in the repository)
+`define MAC16_PRIMITIVE				mac16_generic
+`define ADDER32_PRIMITIVE			adder32_generic
+`define ADDER47_PRIMITIVE			adder47_generic
+`define SUBTRACTOR32_PRIMITIVE	subtractor32_generic
+
+`endif
diff --git a/rtl/lowlevel/mac16_wrapper.v b/rtl/lowlevel/mac16_wrapper.v
new file mode 100644
index 0000000..b91e518
--- /dev/null
+++ b/rtl/lowlevel/mac16_wrapper.v
@@ -0,0 +1,75 @@
+//------------------------------------------------------------------------------
+//
+// mac16_wrapper.v
+// -----------------------------------------------------------------------------
+// Wrapper for 16-bit multiplier and 48-bit accumulator.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module mac16_wrapper
+  (
+    input  wire          clk,    // clock
+    input  wire          clr,    // clear accumulator (active-high)
+    input  wire          ce,     // clock enable (active-high)
+    input  wire [15: 0]  a,      // first factor
+    input  wire [15: 0]  b,      // second factor
+    output wire [46: 0]  s       // accumulated sum
+  );
+
+
+    //
+    // Pull in the MAC16_PRIMITIVE macro (names the module instantiated below)
+    //
+`include "ecdsa_lowlevel_settings.v"
+
+
+    //
+    // The selected primitive receives every wrapper port, name for name
+    //
+  `MAC16_PRIMITIVE mac16_inst
+  (
+    .clk (clk),
+    .ce  (ce),
+    .clr (clr),
+    .a   (a),
+    .b   (b),
+    .s   (s)
+  );
+
+
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/lowlevel/subtractor32_wrapper.v b/rtl/lowlevel/subtractor32_wrapper.v
new file mode 100644
index 0000000..3c7e5e9
--- /dev/null
+++ b/rtl/lowlevel/subtractor32_wrapper.v
@@ -0,0 +1,72 @@
+//------------------------------------------------------------------------------
+//
+// subtractor32_wrapper.v
+// -----------------------------------------------------------------------------
+// Wrapper for 32-bit subtractor.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module subtractor32_wrapper
+	(
+		clk,
+		a, b, d,
+		b_in, b_out
+	);
+
+		// Ports (d is presumably the difference, b_in/b_out the borrow chain)
+	input		wire				clk;
+	input		wire	[31: 0]	a;
+	input		wire	[31: 0]	b;
+	output	wire	[31: 0]	d;
+	input		wire				b_in;
+	output	wire				b_out;
+
+		//
+		// Primitive Selector (expected to define the SUBTRACTOR32_PRIMITIVE macro)
+		//
+`include "ecdsa_lowlevel_settings.v"
+
+		//
+		// Selected Vendor/Generic Primitive, all ports passed straight through
+		//
+	`SUBTRACTOR32_PRIMITIVE subtractor32_inst
+	(
+		.clk(clk),
+		.a(a), .b(b), .d(d),
+		.b_in(b_in), .b_out(b_out)
+	);
+endmodule
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/modular/modular_adder.v b/rtl/modular/modular_adder.v
new file mode 100644
index 0000000..5641feb
--- /dev/null
+++ b/rtl/modular/modular_adder.v
@@ -0,0 +1,298 @@
+//------------------------------------------------------------------------------
+//
+// modular_adder.v
+// -----------------------------------------------------------------------------
+// Modular adder.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module modular_adder
+	(
+		clk, rst_n,
+		ena, rdy,
+		ab_addr, n_addr, s_addr, s_wren,
+		a_din, b_din, n_din, s_dout
+	);
+		// Word-serial modular adder: consumes A, B and N as 32-bit words and emits
+		// the words of S = (A + B) mod N together with a write strobe.
+		//
+		// Parameters (WORD_COUNTER_WIDTH must be able to hold OPERAND_NUM_WORDS - 1)
+		//
+	parameter	OPERAND_NUM_WORDS		= 8;
+	parameter	WORD_COUNTER_WIDTH	= 3;
+	
+	
+		//
+		// Handy Numbers (first and last word index)
+		//
+	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_ZERO	= 0;
+	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_LAST	= OPERAND_NUM_WORDS - 1;
+	
+	
+		//
+		// Handy Functions
+		// (next word index, wrapping back to zero after WORD_INDEX_LAST)
+	function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_NEXT_OR_ZERO;
+		input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
+		begin
+			WORD_INDEX_NEXT_OR_ZERO = (WORD_INDEX_CURRENT < WORD_INDEX_LAST) ?
+				WORD_INDEX_CURRENT + 1'b1 : WORD_INDEX_ZERO;
+		end
+	endfunction
+	
+	
+		//
+		// Ports
+		//
+	input		wire										clk;			// system clock
+	input		wire										rst_n;		// active-low async reset
+	
+	input		wire										ena;			// enable input
+	output	wire										rdy;			// ready output
+	
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	ab_addr;		// index of current A and B words
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	n_addr;		// index of current N word
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	s_addr;		// index of current S word
+	output	wire										s_wren;		// store current S word now
+	
+	input		wire	[                  31:0]	a_din;		// A
+	input		wire	[                  31:0]	b_din;		// B
+	input		wire	[                  31:0]	n_din;		// N
+	output	wire	[                  31:0]	s_dout;		// S = (A + B) mod N
+	
+	
+		//
+		// Word Indices (registered counters that drive the *_addr outputs)
+		//
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_ab;
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_n;
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_s;
+
+		/* map registers to output ports */
+	assign ab_addr	= index_ab;
+	assign n_addr	= index_n;
+	assign s_addr	= index_s;
+
+
+		//
+		// Adder (A word + B word; carry-in is its own carry-out, masked below)
+		//
+	wire	[31: 0]	add32_s;
+	wire				add32_c_in;
+	wire				add32_c_out;
+	
+	adder32_wrapper adder32
+	(
+		.clk		(clk),
+		.a			(a_din),
+		.b			(b_din),
+		.s			(add32_s),
+		.c_in		(add32_c_in),
+		.c_out	(add32_c_out)
+	);
+	
+	
+		//
+		// Subtractor (adder output word - N word; borrow-in is its own borrow-out, masked below)
+		//
+	wire	[31: 0]	sub32_d;
+	wire				sub32_b_in;
+	wire				sub32_b_out;
+	
+	subtractor32_wrapper subtractor32
+	(
+		.clk		(clk),
+		.a			(add32_s),
+		.b			(n_din),
+		.d			(sub32_d),
+		.b_in		(sub32_b_in),
+		.b_out	(sub32_b_out)
+	);
+	
+	
+		// FSM: one-hot shift register. A set bit enters at the MSB when ena is
+		// sampled high while rdy, then moves one position toward bit 0 per clock;
+		// bit 0 is the idle position and drives rdy. Each strobe below is the OR
+		// of a window of OPERAND_NUM_WORDS adjacent bits, so it lasts that many clocks.
+	localparam FSM_SHREG_WIDTH = 2*OPERAND_NUM_WORDS + 5;
+	
+	reg	[FSM_SHREG_WIDTH-1:0]	fsm_shreg;
+	
+	assign rdy = fsm_shreg[0];
+	
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_ab	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 0)];
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_n		= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 1)];
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_sum_ab	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 3) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2)];
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_sum_ab_n	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 3)];
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_data_s	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (2 * OPERAND_NUM_WORDS + 3)];
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_s		= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 5) : FSM_SHREG_WIDTH - (2 * OPERAND_NUM_WORDS + 4)];
+		// single-bit taps: the lowest bit of the store_sum_ab / store_sum_ab_n windows (their last clock)
+	wire fsm_latch_msb_carry	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2)];
+	wire fsm_latch_msb_borrow	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 3)];
+		// a strobe is high while the travelling bit is anywhere inside its window
+	wire inc_index_ab		= |fsm_shreg_inc_index_ab;
+	wire inc_index_n		= |fsm_shreg_inc_index_n;
+	wire store_sum_ab		= |fsm_shreg_store_sum_ab;
+	wire store_sum_ab_n	= |fsm_shreg_store_sum_ab_n;
+	wire store_data_s		= |fsm_shreg_store_data_s;
+	wire inc_index_s		= |fsm_shreg_inc_index_s;
+	
+	always @(posedge clk or negedge rst_n)
+		// asynchronous reset parks the bit in the idle position (bit 0)
+		if (rst_n == 1'b0)
+			//
+			fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+			//
+		else begin
+			// idle: move the bit to the MSB when ena is high, otherwise stay idle
+			if (rdy)	fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
+			// busy: shift right by one position
+			else		fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
+			//
+		end
+		
+		
+		
+		
+	
+	
+	
+		//
+		// Carry & Borrow Masking Logic
+		// (feedback is forced low on the clock after the matching index was zero)
+	reg	add32_c_mask;
+	reg	sub32_b_mask;
+	
+	always @(posedge clk) begin
+		//
+		add32_c_mask <= (index_ab == WORD_INDEX_ZERO) ? 1'b1 : 1'b0;
+		sub32_b_mask <= (index_n  == WORD_INDEX_ZERO) ? 1'b1 : 1'b0;
+		//
+	end
+		
+	assign add32_c_in = add32_c_out & ~add32_c_mask;
+	assign sub32_b_in = sub32_b_out & ~sub32_b_mask;
+	
+	
+		//
+		// Carry & Borrow Latch Logic
+		// (carry-out / borrow-out captured while the matching fsm_latch_* tap is high)
+	reg add32_carry_latch;
+	reg sub32_borrow_latch;
+	
+	always @(posedge clk) begin
+		//
+		if (fsm_latch_msb_carry) add32_carry_latch <= add32_c_out;
+		if (fsm_latch_msb_borrow) sub32_borrow_latch <= sub32_b_out;
+		//
+	end
+
+		
+		//
+		// Intermediate Results
+		// (shift registers: new words enter at the top, bits [31:0] hold the oldest word)
+	reg	[32*OPERAND_NUM_WORDS-1:0]		s_ab;
+	reg	[32*OPERAND_NUM_WORDS-1:0]		s_ab_n;
+	
+	always @(posedge clk)
+		//
+		if (store_data_s) begin
+			// output phase: shift the next word down to [31:0], vacated top word is don't-care
+			s_ab		<= {{32{1'bX}}, s_ab[32*OPERAND_NUM_WORDS-1:32]};
+			s_ab_n	<= {{32{1'bX}}, s_ab_n[32*OPERAND_NUM_WORDS-1:32]};		
+			//
+		end else begin
+			// collect phase: s_ab takes adder words, s_ab_n takes subtractor words
+			if (store_sum_ab) s_ab <= {add32_s, s_ab[32*OPERAND_NUM_WORDS-1:32]};
+			if (store_sum_ab_n) s_ab_n <= {sub32_d, s_ab_n[32*OPERAND_NUM_WORDS-1:32]};
+			//
+		end
+	
+	
+		//
+		// Word Index Increment Logic
+		// (all indices are held at zero while idle and advance on their own strobe)
+	always @(posedge clk)
+		//
+		if (rdy) begin
+			//
+			index_ab		<= WORD_INDEX_ZERO;
+			index_n		<= WORD_INDEX_ZERO;
+			index_s		<= WORD_INDEX_ZERO;
+			//
+		end else begin
+			//
+			if (inc_index_ab) index_ab <= WORD_INDEX_NEXT_OR_ZERO(index_ab);
+			if (inc_index_n)	index_n	<= WORD_INDEX_NEXT_OR_ZERO(index_n);
+			if (inc_index_s)	index_s	<= WORD_INDEX_NEXT_OR_ZERO(index_s);
+			//
+		end
+	
+	
+			//
+			// Output Sum Selector
+			// (A + B when the subtraction borrowed and the addition did not carry, else A + B - N)
+	wire	mux_select_ab = sub32_borrow_latch && !add32_carry_latch;
+			
+	
+			//
+			// Output Data and Write Enable Logic
+			// (both outputs are registered; the write strobe follows store_data_s by one clock)
+	reg				s_wren_reg;
+	reg	[31: 0]	s_dout_reg;
+	wire	[31: 0]	s_dout_mux = mux_select_ab ? s_ab[31:0] : s_ab_n[31:0];
+	
+	assign s_wren = s_wren_reg;
+	assign s_dout = s_dout_reg;
+	
+	always @(posedge clk)
+		//
+		if (rdy) begin
+			// idle: no write, data is don't-care
+			s_wren_reg	<= 1'b0;
+			s_dout_reg	<= {32{1'bX}};
+			//
+		end else begin
+			// busy: present the selected word only while store_data_s is high
+			s_wren_reg <= store_data_s;
+			s_dout_reg <= store_data_s ? s_dout_mux : {32{1'bX}};
+			//
+		end			
+
+	
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_copy.v b/rtl/modular/modular_invertor/helper/modinv_helper_copy.v
new file mode 100644
index 0000000..07c1b4f
--- /dev/null
+++ b/rtl/modular/modular_invertor/helper/modinv_helper_copy.v
@@ -0,0 +1,148 @@
+`timescale 1ns / 1ps
+
+module modinv_helper_copy
+	(
+		clk, rst_n,
+		ena, rdy,
+		s_addr,  s_din,
+		a1_addr,        a1_wren, a1_dout
+	);
+		// Copies OPERAND_NUM_WORDS 32-bit words from the S read port to the A1 write
+		// port; a1_dout is s_din, and the A1 address trails the S address by one clock.
+		//
+		// Parameters (BUFFER_NUM_WORDS is not referenced inside this module)
+		//
+	parameter OPERAND_NUM_WORDS	= 8;
+	parameter OPERAND_ADDR_BITS	= 3;
+	
+	parameter BUFFER_NUM_WORDS		= 9;
+	parameter BUFFER_ADDR_BITS		= 4;
+	
+	
+		//
+		// clog2 (forward slash in the path so that non-Windows tools can resolve it)
+		//
+`include "../modinv_clog2.v"
+	
+	
+		//
+		// Constants (number of counter states, idle state included, and counter width)
+		//
+	localparam PROC_NUM_CYCLES	= OPERAND_NUM_WORDS + 2;
+	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
+	
+	
+		//
+		// Ports
+		//
+	input		wire									clk;
+	input		wire									rst_n;
+	
+	input		wire									ena;
+	output	wire									rdy;
+
+	output	wire	[ BUFFER_ADDR_BITS-1:0]	s_addr;
+	output	wire	[OPERAND_ADDR_BITS-1:0]	a1_addr;
+	
+	output	wire									a1_wren;
+	
+	input		wire	[                 31:0]	s_din;
+
+	output	wire	[                 31:0]	a1_dout;
+
+
+		//
+		// Counter (zero is the idle state; wraps to zero after PROC_NUM_CYCLES - 1)
+		//
+	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
+
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
+																	proc_cnt + 1'b1 : proc_cnt_zero;
+	
+		//
+		// Addresses (s_addr is addr_s zero-extended to BUFFER_ADDR_BITS)
+		//
+	reg	[OPERAND_ADDR_BITS-1:0]	addr_s;
+
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_s_max		= OPERAND_NUM_WORDS - 1;
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_s_zero		= {OPERAND_ADDR_BITS{1'b0}};
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_s_next		= (addr_s < addr_s_max) ?
+																		addr_s + 1'b1 : addr_s_zero;
+																		
+	reg	[OPERAND_ADDR_BITS-1:0]	addr_a1;
+	
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_a1_max		= OPERAND_NUM_WORDS - 1;
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_a1_zero	= {OPERAND_ADDR_BITS{1'b0}};
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_a1_next	= (addr_a1 < addr_a1_max) ?
+																		addr_a1 + 1'b1 : addr_a1_zero;
+																		
+	assign s_addr  = {{(BUFFER_ADDR_BITS - OPERAND_ADDR_BITS){1'b0}}, addr_s};
+	assign a1_addr = addr_a1;
+	
+		
+		//
+		// Ready Flag (high whenever the counter sits in the idle state)
+		//
+	assign rdy = (proc_cnt == proc_cnt_zero);
+	
+	
+		//
+		// Address Increment Logic
+		// (an address counts only inside its counter window, otherwise it is reset to zero)
+	wire	inc_addr_s;
+	wire	inc_addr_a1;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_s_start		= 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_s_stop		= OPERAND_NUM_WORDS + 0;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_a1_start	= 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_a1_stop		= OPERAND_NUM_WORDS + 1;
+
+	assign inc_addr_s		= (proc_cnt >= cnt_inc_addr_s_start)  && (proc_cnt <= cnt_inc_addr_s_stop);
+	assign inc_addr_a1	= (proc_cnt >= cnt_inc_addr_a1_start) && (proc_cnt <= cnt_inc_addr_a1_stop);
+	
+	always @(posedge clk) begin
+		//
+		if (inc_addr_s)	addr_s <= addr_s_next;
+		else					addr_s <= addr_s_zero;
+		//
+		if (inc_addr_a1)	addr_a1 <= addr_a1_next;
+		else					addr_a1 <= addr_a1_zero;
+		//
+	end
+	
+	
+		//
+		// Write Enable Logic (same counter window as the A1 address increment)
+		//
+	wire	wren_a1;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_a1_start	= 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_a1_stop	= OPERAND_NUM_WORDS + 1;
+
+	assign wren_a1 = (proc_cnt >= cnt_wren_a1_start) && (proc_cnt <= cnt_wren_a1_stop);
+
+	assign a1_wren = wren_a1;
+	
+	
+		//
+		// Data Logic (read data is forwarded unmodified)
+		//
+	assign a1_dout = s_din;
+	
+	
+		//
+		// Primary Counter Logic
+		// (leaves idle only when ena is high, then free-runs until it wraps back to zero)
+	always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
+		else begin
+			if (!rdy)		proc_cnt <= proc_cnt_next;
+			else if (ena)	proc_cnt <= proc_cnt_next;
+		end
+
+
+endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_init.v b/rtl/modular/modular_invertor/helper/modinv_helper_init.v
new file mode 100644
index 0000000..0468134
--- /dev/null
+++ b/rtl/modular/modular_invertor/helper/modinv_helper_init.v
@@ -0,0 +1,172 @@
+`timescale 1ns / 1ps
+
+module modinv_helper_init
+	(
+		clk, rst_n,
+		ena, rdy,
+		a_addr, a_din,
+		q_addr, q_din,
+		r_addr, r_wren, r_dout,
+		s_addr, s_wren, s_dout,
+		u_addr, u_wren, u_dout,
+		v_addr, v_wren, v_dout
+	);
+		// Loads the four work buffers in one pass: R = 0, S = 1, U = Q, V = A.
+		// The word written to U and V on the last write cycle is forced to zero.
+		//
+		// Parameters
+		//
+	parameter OPERAND_NUM_WORDS	= 8;
+	parameter OPERAND_ADDR_BITS	= 3;
+	
+	parameter BUFFER_NUM_WORDS		= 9;
+	parameter BUFFER_ADDR_BITS		= 4;
+	
+	
+		//
+		// clog2 (forward slash in the path so that non-Windows tools can resolve it)
+		//
+`include "../modinv_clog2.v"
+	
+	
+		// Constants. NOTE: the last counter state (PROC_NUM_CYCLES - 1) matches the
+		// write window end (BUFFER_NUM_WORDS + 1) only when BUFFER_NUM_WORDS equals
+		// OPERAND_NUM_WORDS + 1, which holds for the default parameter values.
+	localparam PROC_NUM_CYCLES	= OPERAND_NUM_WORDS + 3;
+	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
+	
+	
+		//
+		// Ports
+		//
+	input		wire									clk;
+	input		wire									rst_n;
+	input		wire									ena;
+	output	wire									rdy;
+
+	output	wire	[OPERAND_ADDR_BITS-1:0]	a_addr;
+	output	wire	[OPERAND_ADDR_BITS-1:0]	q_addr;
+	output	wire	[ BUFFER_ADDR_BITS-1:0]	r_addr;
+	output	wire	[ BUFFER_ADDR_BITS-1:0]	s_addr;
+	output	wire	[ BUFFER_ADDR_BITS-1:0]	u_addr;
+	output	wire	[ BUFFER_ADDR_BITS-1:0]	v_addr;
+	
+	output	wire									r_wren;
+	output	wire									s_wren;
+	output	wire									u_wren;
+	output	wire									v_wren;
+	
+	input		wire	[                 31:0]	a_din;
+	input		wire	[                 31:0]	q_din;
+	output	wire	[                 31:0]	r_dout;
+	output	wire	[                 31:0]	s_dout;
+	output	wire	[                 31:0]	u_dout;
+	output	wire	[                 31:0]	v_dout;
+
+
+		//
+		// Counter (zero is the idle state; wraps to zero after PROC_NUM_CYCLES - 1)
+		//
+	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
+
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
+																	proc_cnt + 1'b1 : proc_cnt_zero;
+	
+		//
+		// Addresses (one shared read address for A and Q, one shared write address for R, S, U, V)
+		//
+	reg	[OPERAND_ADDR_BITS-1:0]	addr_aq;
+
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_aq_max		= OPERAND_NUM_WORDS - 1;
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_aq_zero	= {OPERAND_ADDR_BITS{1'b0}};
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_aq_next	= (addr_aq < addr_aq_max) ?
+																		addr_aq + 1'b1 : addr_aq_zero;
+																		
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_rsuv;
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_rsuv_max	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_rsuv_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_rsuv_next	= (addr_rsuv < addr_rsuv_max) ?
+																		addr_rsuv + 1'b1 : addr_rsuv_zero;
+																		
+	assign a_addr = addr_aq;
+	assign q_addr = addr_aq;
+	
+	assign r_addr = addr_rsuv;
+	assign s_addr = addr_rsuv;
+	assign u_addr = addr_rsuv;
+	assign v_addr = addr_rsuv;
+	
+		
+		//
+		// Ready Flag (high whenever the counter sits in the idle state)
+		//
+	assign rdy = (proc_cnt == proc_cnt_zero);
+	
+	
+		//
+		// Address Increment Logic
+		// (an address counts only inside its counter window, otherwise it is reset to zero)
+	wire	inc_addr_aq;
+	wire	inc_addr_rsuv;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_aq_start	= 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_aq_stop		= OPERAND_NUM_WORDS;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_rsuv_start	= 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_rsuv_stop	= BUFFER_NUM_WORDS + 1;
+
+	assign inc_addr_aq   = (proc_cnt >= cnt_inc_addr_aq_start)   && (proc_cnt <= cnt_inc_addr_aq_stop);
+	assign inc_addr_rsuv = (proc_cnt >= cnt_inc_addr_rsuv_start) && (proc_cnt <= cnt_inc_addr_rsuv_stop);
+	
+	always @(posedge clk) begin
+		//
+		if (inc_addr_aq)	addr_aq <= addr_aq_next;
+		else					addr_aq <= addr_aq_zero;
+		//
+		if (inc_addr_rsuv)	addr_rsuv <= addr_rsuv_next;
+		else						addr_rsuv <= addr_rsuv_zero;
+		//
+	end
+	
+	
+		//
+		// Write Enable Logic (one strobe shared by all four buffers, same window as their address)
+		//
+	wire	wren_rsuv;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_rsuv_start	= 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_rsuv_stop	= BUFFER_NUM_WORDS + 1;
+
+	assign wren_rsuv = (proc_cnt >= cnt_wren_rsuv_start) && (proc_cnt <= cnt_wren_rsuv_stop);
+
+	assign r_wren = wren_rsuv;
+	assign s_wren = wren_rsuv;
+	assign u_wren = wren_rsuv;
+	assign v_wren = wren_rsuv;
+	
+	
+		//
+		// Data Logic
+		// (R: all zeroes; S: one in the first written word only; U/V: Q/A words, zero on the last write)
+	assign r_dout = 32'd0;
+	assign s_dout = (proc_cnt == cnt_wren_rsuv_start) ? 32'd1 : 32'd0;
+	assign u_dout = (proc_cnt != cnt_wren_rsuv_stop)  ? q_din : 32'd0;
+	assign v_dout = (proc_cnt != cnt_wren_rsuv_stop)  ? a_din : 32'd0;
+	
+	
+		//
+		// Primary Counter Logic
+		// (leaves idle only when ena is high, then free-runs until it wraps back to zero)
+	always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
+		else begin
+			if (!rdy)		proc_cnt <= proc_cnt_next;
+			else if (ena)	proc_cnt <= proc_cnt_next;
+		end
+
+
+endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_invert_compare.v b/rtl/modular/modular_invertor/helper/modinv_helper_invert_compare.v
new file mode 100644
index 0000000..6b65eb1
--- /dev/null
+++ b/rtl/modular/modular_invertor/helper/modinv_helper_invert_compare.v
@@ -0,0 +1,286 @@
+`timescale 1ns / 1ps
+
+module modinv_helper_invert_compare
+	(
+		clk, rst_n,
+		ena, rdy,
+		
+		u_addr, u_din,
+		v_addr, v_din,
+		
+		u_gt_v, v_eq_1,
+		u_is_even, v_is_even
+	);
+		// Scans the multi-word values U and V from the highest buffer address down to
+		// address zero and produces four flags: u_gt_v, v_eq_1, u_is_even, v_is_even.
+		//
+		// Parameters
+		//
+	parameter BUFFER_NUM_WORDS		= 9;
+	parameter BUFFER_ADDR_BITS		= 4;
+	
+	
+		//
+		// clog2 (forward slash in the path so that non-Windows tools can resolve it)
+		//
+`include "../modinv_clog2.v"
+	
+	
+		//
+		// Constants (number of counter states, idle state included, and counter width)
+		//
+	localparam PROC_NUM_CYCLES	= 1 * BUFFER_NUM_WORDS + 10;
+	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
+	
+	
+		//
+		// Ports
+		//
+	input		wire									clk;
+	input		wire									rst_n;
+	input		wire									ena;
+	output	wire									rdy;
+
+	output	wire	[BUFFER_ADDR_BITS-1:0]	u_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	v_addr;
+		
+	input		wire	[              32-1:0]	u_din;
+	input		wire	[              32-1:0]	v_din;
+		
+	output	wire									u_gt_v;
+	output	wire									v_eq_1;
+	output	wire									u_is_even;
+	output	wire									v_is_even;
+
+
+		//
+		// Counter (zero is the idle state; wraps to zero after PROC_NUM_CYCLES - 1)
+		//
+	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
+
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
+																	proc_cnt + 1'b1 : proc_cnt_zero;
+	
+		//
+		// Addresses (one down-counting read address shared by U and V)
+		//
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_in;
+
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_last	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_prev	= (addr_in > addr_in_zero) ?
+																		addr_in - 1'b1 : addr_in_last;
+																			
+	assign u_addr					= addr_in;
+	assign v_addr					= addr_in;	
+	
+	
+		//
+		// Ready Flag (high whenever the counter sits in the idle state)
+		//
+	assign rdy = (proc_cnt == proc_cnt_zero);
+	
+	
+		//
+		// Address Decrement Logic
+		// (address is preloaded with the last word index while idle, then steps down)
+	wire	dec_addr_in;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_in_start	= 0 * BUFFER_NUM_WORDS + 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_in_stop		= 1 * BUFFER_NUM_WORDS + 0;
+	
+	assign dec_addr_in   = (proc_cnt >= cnt_dec_addr_in_start)   && (proc_cnt <= cnt_dec_addr_in_stop);
+	
+	always @(posedge clk)
+		//
+		if (rdy)						addr_in <= addr_in_last;
+		else if (dec_addr_in)	addr_in <= addr_in_prev;
+	
+	
+		//
+		// Comparison Stage Flags
+		// (calc_leg: subtractor outputs are examined; calc_leg_final: its last cycle; calc_parity: LSBs sampled)
+	wire	calc_leg;
+	wire	calc_leg_final;
+	wire	calc_parity;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_calc_leg_start	= 0 * BUFFER_NUM_WORDS + 3;
+	wire	[PROC_CNT_BITS-1:0]	cnt_calc_leg_stop		= 1 * BUFFER_NUM_WORDS + 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_calc_parity		= 1 * BUFFER_NUM_WORDS + 1;
+	
+	assign calc_leg = (proc_cnt >= cnt_calc_leg_start) && (proc_cnt <= cnt_calc_leg_stop);
+	assign calc_leg_final = (proc_cnt == cnt_calc_leg_stop);
+	assign calc_parity = (proc_cnt == cnt_calc_parity);
+
+	
+		//
+		// Dummy Input
+		// (the constant 1 for v - 1: LSB is set one clock after the read address was zero, else all zeroes)
+	reg	sub32_din_1_lsb;
+	wire	[31: 0]	sub32_din_1 = {{31{1'b0}}, sub32_din_1_lsb};
+	
+	always @(posedge clk)
+		//
+		sub32_din_1_lsb <= (addr_in == addr_in_zero) ? 1'b1 : 1'b0;
+	
+	
+		//
+		// Subtractor (u - v)
+		//
+	wire	[31: 0]	sub32_u_minus_v_difference_out;
+	wire				sub32_u_minus_v_borrow_in;
+	wire				sub32_u_minus_v_borrow_out;
+	
+	subtractor32_wrapper sub32_u_minus_v
+	(
+		.clk		(clk),
+		.a			(u_din),
+		.b			(v_din),
+		.d			(sub32_u_minus_v_difference_out),
+		.b_in		(sub32_u_minus_v_borrow_in),
+		.b_out	(sub32_u_minus_v_borrow_out)
+	);
+	
+	
+		//
+		// Subtractor (v - 1)
+		//
+	wire	[31: 0]	sub32_v_minus_1_difference_out;
+	wire				sub32_v_minus_1_borrow_in;
+	wire				sub32_v_minus_1_borrow_out;
+	
+	subtractor32_wrapper sub32_v_minus_1
+	(
+		.clk		(clk),
+		.a			(v_din),
+		.b			(sub32_din_1),
+		.d			(sub32_v_minus_1_difference_out),
+		.b_in		(sub32_v_minus_1_borrow_in),
+		.b_out	(sub32_v_minus_1_borrow_out)
+	);
+	
+	
+	
+		//
+		// Borrow Masking Logic
+		// (borrow feedback is enabled only on the clock after the counter was inside (start, stop])
+	reg	mask_borrow;
+	
+	always @(posedge clk)
+		//
+		mask_borrow <= ((proc_cnt > cnt_dec_addr_in_start) && (proc_cnt <= cnt_dec_addr_in_stop)) ?
+			1'b0 : 1'b1;
+		
+	assign sub32_u_minus_v_borrow_in = sub32_u_minus_v_borrow_out & ~mask_borrow;
+	assign sub32_v_minus_1_borrow_in = sub32_v_minus_1_borrow_out & ~mask_borrow;
+	
+		
+		//
+		// Comparison Logic
+		// (l / e / g = less / equal / greater; a comparison is unresolved until l or g is set)
+	reg	cmp_u_v_l;
+	reg	cmp_u_v_e;
+	reg	cmp_u_v_g;
+
+	reg	cmp_v_1_l;
+	reg	cmp_v_1_e;
+	reg	cmp_v_1_g;
+
+	wire	cmp_unresolved_u_v = !(cmp_u_v_l || cmp_u_v_g);
+	wire	cmp_unresolved_v_1 = !(cmp_v_1_l || cmp_v_1_g);
+
+	wire	cmp_u_v_borrow_is_set			= (sub32_u_minus_v_borrow_out     ==  1'b1) ? 1'b1 : 1'b0;
+	wire	cmp_u_v_difference_is_nonzero	= (sub32_u_minus_v_difference_out != 32'd0) ? 1'b1 : 1'b0;
+
+	wire	cmp_v_1_borrow_is_set			= (sub32_v_minus_1_borrow_out     ==  1'b1) ? 1'b1 : 1'b0;
+	wire	cmp_v_1_difference_is_nonzero	= (sub32_v_minus_1_difference_out != 32'd0) ? 1'b1 : 1'b0;
+
+	reg	u_is_even_reg;
+	reg	v_is_even_reg;
+
+	always @(posedge clk)
+		//
+		if (rdy) begin
+			// idle: flags keep their previous result until a new run is requested
+			if (ena) begin
+				// new run: clear all comparison flags, parity is don't-care until sampled
+				cmp_u_v_l		<= 1'b0;
+				cmp_u_v_e		<= 1'b0;
+				cmp_u_v_g		<= 1'b0;
+				//
+				cmp_v_1_l		<= 1'b0;
+				cmp_v_1_e		<= 1'b0;
+				cmp_v_1_g		<= 1'b0;
+				//
+				u_is_even_reg	<= 1'bX;
+				v_is_even_reg	<= 1'bX;
+				//
+			end
+			//
+		end else begin
+			//
+			// parity (even when bit 0 of the word on the data bus at calc_parity is clear)
+			//
+			if (calc_parity) begin
+				u_is_even_reg <= ~u_din[0];
+				v_is_even_reg <= ~v_din[0];
+			end
+			//
+			// u <> v (borrow decides "less", borrow-free nonzero difference decides "greater")
+			//
+			if (cmp_unresolved_u_v && calc_leg) begin
+				//
+				if (cmp_u_v_borrow_is_set)
+					cmp_u_v_l <= 1'b1;
+				//
+				if (!cmp_u_v_borrow_is_set && cmp_u_v_difference_is_nonzero)
+					cmp_u_v_g <= 1'b1;
+				// "equal" can only be declared on the final comparison cycle
+				if (!cmp_u_v_borrow_is_set && !cmp_u_v_difference_is_nonzero && calc_leg_final)
+					cmp_u_v_e <= 1'b1;
+				//
+			end
+			//
+			// v <> 1 (same scheme as above, using the v - 1 subtractor)
+			//
+			if (cmp_unresolved_v_1 && calc_leg) begin
+				//
+				if (cmp_v_1_borrow_is_set)
+					cmp_v_1_l <= 1'b1;
+				//
+				if (!cmp_v_1_borrow_is_set && cmp_v_1_difference_is_nonzero)
+					cmp_v_1_g <= 1'b1;
+				// "equal" can only be declared on the final comparison cycle
+				if (!cmp_v_1_borrow_is_set && !cmp_v_1_difference_is_nonzero && calc_leg_final)
+					cmp_v_1_e <= 1'b1;
+				//
+			end			
+			//
+		end
+
+
+		//
+		// Output Flags (each flag requires its own bit set and the other two clear)
+		//
+	assign u_gt_v = !cmp_u_v_l && !cmp_u_v_e &&  cmp_u_v_g;
+	assign v_eq_1 = !cmp_v_1_l &&  cmp_v_1_e && !cmp_v_1_g;
+	
+	assign u_is_even = u_is_even_reg;
+	assign v_is_even = v_is_even_reg;
+
+
+		//
+		// Primary Counter Logic
+		// (leaves idle only when ena is high, then free-runs until it wraps back to zero)
+	always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
+		else begin
+			if (!rdy)		proc_cnt <= proc_cnt_next;
+			else if (ena)	proc_cnt <= proc_cnt_next;
+		end
+
+
+endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_invert_precalc.v b/rtl/modular/modular_invertor/helper/modinv_helper_invert_precalc.v
new file mode 100644
index 0000000..ab15563
--- /dev/null
+++ b/rtl/modular/modular_invertor/helper/modinv_helper_invert_precalc.v
@@ -0,0 +1,408 @@
+`timescale 1ns / 1ps
+
+module modinv_helper_invert_precalc
+	(
+		clk, rst_n,
+		ena, rdy,
+		
+		r_addr, r_din,
+		s_addr, s_din,
+		u_addr, u_din,
+		v_addr, v_din,
+		
+		r_dbl_addr,          r_dbl_wren,          r_dbl_dout,
+		s_dbl_addr,          s_dbl_wren,          s_dbl_dout,
+		r_plus_s_addr,       r_plus_s_wren,       r_plus_s_dout,
+		u_half_addr,         u_half_wren,         u_half_dout,
+		v_half_addr,         v_half_wren,         v_half_dout,
+		u_minus_v_addr,      u_minus_v_wren,      u_minus_v_dout,      u_minus_v_din,
+		v_minus_u_addr,      v_minus_u_wren,      v_minus_u_dout,      v_minus_u_din,
+		u_minus_v_half_addr, u_minus_v_half_wren, u_minus_v_half_dout,
+		v_minus_u_half_addr, v_minus_u_half_wren, v_minus_u_half_dout
+	);
+	
+
+		//
+		// Parameters
+		//
+	parameter BUFFER_NUM_WORDS		= 9;
+	parameter BUFFER_ADDR_BITS		= 4;
+	
+	
+		//
+		// clog2 (forward-slash path so the include also resolves on non-Windows hosts)
+		//
+`include "../modinv_clog2.v"
+	
+	
+		//
+		// Constants
+		//
+	localparam PROC_NUM_CYCLES	= 2 * BUFFER_NUM_WORDS + 4;
+	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
+	
+	
+		//
+		// Ports
+		//
+	input		wire									clk;
+	input		wire									rst_n;
+	input		wire									ena;
+	output	wire									rdy;
+
+	output	wire	[BUFFER_ADDR_BITS-1:0]	r_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	s_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	u_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	v_addr;
+		
+	input		wire	[              32-1:0]	r_din;
+	input		wire	[              32-1:0]	s_din;
+	input		wire	[              32-1:0]	u_din;
+	input		wire	[              32-1:0]	v_din;
+		
+	output	wire	[BUFFER_ADDR_BITS-1:0]	r_dbl_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	s_dbl_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	r_plus_s_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	u_half_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	v_half_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	u_minus_v_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	v_minus_u_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	u_minus_v_half_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	v_minus_u_half_addr;
+		
+	output	wire	[              32-1:0]	r_dbl_dout;
+	output	wire	[              32-1:0]	s_dbl_dout;
+	output	wire	[              32-1:0]	r_plus_s_dout;
+	output	wire	[              32-1:0]	u_half_dout;
+	output	wire	[              32-1:0]	v_half_dout;
+	output	wire	[              32-1:0]	u_minus_v_dout;
+	output	wire	[              32-1:0]	v_minus_u_dout;
+	output	wire	[              32-1:0]	u_minus_v_half_dout;
+	output	wire	[              32-1:0]	v_minus_u_half_dout;
+		
+	output	wire									r_dbl_wren;
+	output	wire									s_dbl_wren;
+	output	wire									r_plus_s_wren;
+	output	wire									u_half_wren;
+	output	wire									v_half_wren;
+	output	wire									u_minus_v_wren;
+	output	wire									v_minus_u_wren;
+	output	wire									u_minus_v_half_wren;
+	output	wire									v_minus_u_half_wren;
+	
+	input		wire	[              32-1:0]	u_minus_v_din;
+	input		wire	[              32-1:0]	v_minus_u_din;
+	
+
+
+		//
+		// Counter
+		//
+	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
+
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
+																	proc_cnt + 1'b1 : proc_cnt_zero;
+	
+		//
+		// Addresses
+		//
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_in;
+
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_last	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_next	= (addr_in < addr_in_last) ?
+																		addr_in + 1'b1 : addr_in_zero;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_prev	= (addr_in > addr_in_zero) ?
+																		addr_in - 1'b1 : addr_in_zero;
+																		
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_out1;
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_last	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_next	= (addr_out1 < addr_out1_last) ?
+																		addr_out1 + 1'b1 : addr_out1_zero;
+																		
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_out2;
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_last	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_next	= (addr_out2 < addr_out2_last) ?
+																		addr_out2 + 1'b1 : addr_out2_zero;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_prev	= (addr_out2 > addr_out2_zero) ?
+																		addr_out2 - 1'b1 : addr_out2_zero;
+																		
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_out3;
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_last	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_prev	= (addr_out3 > addr_out3_zero) ?
+																		addr_out3 - 1'b1 : addr_out3_last;
+
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_out4;
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out4_last	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out4_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out4_prev	= (addr_out4 > addr_out4_zero) ?
+																		addr_out4 - 1'b1 : addr_out4_last;
+
+	
+	assign r_addr					= addr_in;
+	assign s_addr					= addr_in;
+	assign u_addr					= addr_in;
+	assign v_addr					= addr_in;
+		
+	assign r_dbl_addr				= addr_out1;
+	assign s_dbl_addr				= addr_out1;
+	assign r_plus_s_addr			= addr_out2;
+	assign u_half_addr			= addr_out3;
+	assign v_half_addr			= addr_out3;
+	assign u_minus_v_addr		= addr_out2;
+	assign v_minus_u_addr		= addr_out2;
+	assign u_minus_v_half_addr	= addr_out4;
+	assign v_minus_u_half_addr	= addr_out4;
+	
+		
+		//
+		// Ready Flag
+		//
+	assign rdy = (proc_cnt == proc_cnt_zero);
+	
+	
+		//
+		// Address Increment/Decrement Logic
+		//
+	wire	inc_addr_in;
+	wire	dec_addr_in;
+	wire	inc_addr_out1;
+	wire	inc_addr_out2;
+	wire	dec_addr_out2;
+	wire	dec_addr_out3;
+	wire	dec_addr_out4;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_start	= 0 * BUFFER_NUM_WORDS + 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_stop		= 1 * BUFFER_NUM_WORDS - 1;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out1_start	= 0 * BUFFER_NUM_WORDS + 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out1_stop	= 1 * BUFFER_NUM_WORDS + 1;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out2_start	= 0 * BUFFER_NUM_WORDS + 3;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out2_stop	= 1 * BUFFER_NUM_WORDS + 1;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out2_start	= 1 * BUFFER_NUM_WORDS + 3;
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out2_stop	= 2 * BUFFER_NUM_WORDS + 1;	
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_in_start	= 1 * BUFFER_NUM_WORDS + 0;
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_in_stop		= 2 * BUFFER_NUM_WORDS - 2;	
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out3_start	= 1 * BUFFER_NUM_WORDS + 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out3_stop	= 2 * BUFFER_NUM_WORDS + 0;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out4_start	= 1 * BUFFER_NUM_WORDS + 4;
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out4_stop	= 2 * BUFFER_NUM_WORDS + 3;	
+
+	assign inc_addr_in   = (proc_cnt >= cnt_inc_addr_in_start)   && (proc_cnt <= cnt_inc_addr_in_stop);
+	assign dec_addr_in   = (proc_cnt >= cnt_dec_addr_in_start)   && (proc_cnt <= cnt_dec_addr_in_stop);
+	assign inc_addr_out1 = (proc_cnt >= cnt_inc_addr_out1_start) && (proc_cnt <= cnt_inc_addr_out1_stop);
+	assign inc_addr_out2 = (proc_cnt >= cnt_inc_addr_out2_start) && (proc_cnt <= cnt_inc_addr_out2_stop);
+	assign dec_addr_out2 = (proc_cnt >= cnt_dec_addr_out2_start) && (proc_cnt <= cnt_dec_addr_out2_stop);
+	assign dec_addr_out3 = (proc_cnt >= cnt_dec_addr_out3_start) && (proc_cnt <= cnt_dec_addr_out3_stop);
+	assign dec_addr_out4 = (proc_cnt >= cnt_dec_addr_out4_start) && (proc_cnt <= cnt_dec_addr_out4_stop);
+	
+	
+	always @(posedge clk) begin
+		// idle (rdy = 1): preset every pointer for the next pass
+		if (rdy) begin
+			// ascending pointers start at word 0, descending ones at the last word
+			addr_in 		<= addr_in_zero;
+			addr_out1	<= addr_out1_zero;
+			addr_out2	<= addr_out2_zero;
+			addr_out3	<= addr_out3_last;
+			addr_out4	<= addr_out4_last;
+			//
+		end else begin
+			// source pointer (r/s/u/v): up, then back down; holds when neither strobe is set
+			if (inc_addr_in)				addr_in <= addr_in_next;
+			else if (dec_addr_in)		addr_in <= addr_in_prev;
+			// r_dbl/s_dbl pointer: up inside its window, forced to word 0 outside it
+			if (inc_addr_out1)			addr_out1 <= addr_out1_next;
+			else								addr_out1 <= addr_out1_zero;
+			// r_plus_s/u_minus_v/v_minus_u pointer: up, then back down (presumably to read the differences back for halving)
+			if (inc_addr_out2)			addr_out2 <= addr_out2_next;
+			else if (dec_addr_out2)		addr_out2 <= addr_out2_prev;
+			// u_half/v_half pointer: down inside its window, reloaded with the last word otherwise
+			if (dec_addr_out3)			addr_out3 <= addr_out3_prev;
+			else								addr_out3 <= addr_out3_last;
+			// u_minus_v_half/v_minus_u_half pointer: same descending walk, later window
+			if (dec_addr_out4)			addr_out4 <= addr_out4_prev;
+			else								addr_out4 <= addr_out4_last;
+			//
+		end
+		//
+	end
+	
+	
+		//
+		// Write Enable Logic
+		//
+	wire	wren_out1;
+	wire	wren_out2;
+	wire	wren_out3;
+	wire	wren_out4;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out1_start	= 0 * BUFFER_NUM_WORDS + 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out1_stop	= 1 * BUFFER_NUM_WORDS + 1;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out2_start	= 0 * BUFFER_NUM_WORDS + 3;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out2_stop	= 1 * BUFFER_NUM_WORDS + 2;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out3_start	= 1 * BUFFER_NUM_WORDS + 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out3_stop	= 2 * BUFFER_NUM_WORDS + 0;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out4_start	= 1 * BUFFER_NUM_WORDS + 4;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out4_stop	= 2 * BUFFER_NUM_WORDS + 3;
+
+	assign wren_out1 = (proc_cnt >= cnt_wren_out1_start) && (proc_cnt <= cnt_wren_out1_stop);
+	assign wren_out2 = (proc_cnt >= cnt_wren_out2_start) && (proc_cnt <= cnt_wren_out2_stop);
+	assign wren_out3 = (proc_cnt >= cnt_wren_out3_start) && (proc_cnt <= cnt_wren_out3_stop);
+	assign wren_out4 = (proc_cnt >= cnt_wren_out4_start) && (proc_cnt <= cnt_wren_out4_stop);
+
+	assign r_dbl_wren				= wren_out1;
+	assign s_dbl_wren				= wren_out1;
+	assign r_plus_s_wren			= wren_out2;
+	assign u_half_wren			= wren_out3;
+	assign v_half_wren			= wren_out3;
+	assign u_minus_v_wren		= wren_out2;
+	assign v_minus_u_wren		= wren_out2;
+	assign u_minus_v_half_wren	= wren_out4;
+	assign v_minus_u_half_wren	= wren_out4;
+
+
+		//
+		// Adder (r + s)
+		//
+	wire	[31: 0]	add32_r_plus_s_sum_out;
+	wire				add32_r_plus_s_carry_in;
+	wire				add32_r_plus_s_carry_out;
+	
+	adder32_wrapper add32_r_plus_s
+	(
+		.clk		(clk),
+		.a			(r_din),
+		.b			(s_din),
+		.s			(add32_r_plus_s_sum_out),
+		.c_in		(add32_r_plus_s_carry_in),
+		.c_out	(add32_r_plus_s_carry_out)
+	);
+	
+		//
+		// Subtractor (u - v)
+		//
+	wire	[31: 0]	sub32_u_minus_v_difference_out;
+	wire				sub32_u_minus_v_borrow_in;
+	wire				sub32_u_minus_v_borrow_out;
+	
+	subtractor32_wrapper sub32_u_minus_v
+	(
+		.clk		(clk),
+		.a			(u_din),
+		.b			(v_din),
+		.d			(sub32_u_minus_v_difference_out),
+		.b_in		(sub32_u_minus_v_borrow_in),
+		.b_out	(sub32_u_minus_v_borrow_out)
+	);
+	
+		//
+		// Subtractor (v - u)
+		//
+	wire	[31: 0]	sub32_v_minus_u_difference_out;
+	wire				sub32_v_minus_u_borrow_in;
+	wire				sub32_v_minus_u_borrow_out;
+	
+	subtractor32_wrapper sub32_v_minus_u
+	(
+		.clk		(clk),
+		.a			(v_din),
+		.b			(u_din),
+		.d			(sub32_v_minus_u_difference_out),
+		.b_in		(sub32_v_minus_u_borrow_in),
+		.b_out	(sub32_v_minus_u_borrow_out)
+	);
+	
+	
+		//
+		// Carry & Borrow Masking Logic
+		// (mask = 1 zeroes the carry/borrow fed back into the 32-bit adder and subtractors)
+	reg	mask_carry_borrow;
+	
+	always @(posedge clk)
+		// registered: mask is 0 on the clock after proc_cnt was in [cnt_wren_out1_start, cnt_wren_out1_stop)
+		mask_carry_borrow <= ((proc_cnt >= cnt_wren_out1_start) && (proc_cnt < cnt_wren_out1_stop)) ?
+			1'b0 : 1'b1;
+		
+	assign add32_r_plus_s_carry_in   = add32_r_plus_s_carry_out   & ~mask_carry_borrow;
+	assign sub32_u_minus_v_borrow_in = sub32_u_minus_v_borrow_out & ~mask_carry_borrow;
+	assign sub32_v_minus_u_borrow_in = sub32_v_minus_u_borrow_out & ~mask_carry_borrow;
+	
+	
+		//
+		// Carry Bits: each register captures the bit shifted out of the word seen this clock
+		// (bit 31 for the doublings, bit 0 for the halvings) and is 0 outside its counter window
+	reg	r_dbl_carry;
+	reg	s_dbl_carry;
+	reg	u_half_carry;
+	reg	v_half_carry;
+	reg	u_minus_v_half_carry;
+	reg	v_minus_u_half_carry;
+	
+	always @(posedge clk) begin
+		
+		r_dbl_carry					<= ((proc_cnt >= cnt_wren_out1_start) && (proc_cnt < cnt_wren_out1_stop)) ?
+											r_din[31] : 1'b0;
+								
+		s_dbl_carry					<= ((proc_cnt >= cnt_wren_out1_start) && (proc_cnt < cnt_wren_out1_stop)) ?
+											s_din[31] : 1'b0;
+								
+		u_half_carry				<= ((proc_cnt >= cnt_wren_out3_start) && (proc_cnt < cnt_wren_out3_stop)) ?
+											u_din[0] : 1'b0;
+		
+		v_half_carry				<= ((proc_cnt >= cnt_wren_out3_start) && (proc_cnt < cnt_wren_out3_stop)) ?
+											v_din[0] : 1'b0;
+									
+		u_minus_v_half_carry		<= ((proc_cnt >= cnt_wren_out4_start) && (proc_cnt < cnt_wren_out4_stop)) ?
+											u_minus_v_din[0] : 1'b0;
+		
+		v_minus_u_half_carry		<= ((proc_cnt >= cnt_wren_out4_start) && (proc_cnt < cnt_wren_out4_stop)) ?
+											v_minus_u_din[0] : 1'b0;
+
+	end
+	
+	
+		//
+		// Data Mapper
+		//
+	assign r_dbl_dout				= {r_din[30:0], r_dbl_carry};
+	assign s_dbl_dout				= {s_din[30:0], s_dbl_carry};
+	assign r_plus_s_dout			= add32_r_plus_s_sum_out;
+	assign u_half_dout			= {u_half_carry, u_din[31:1]};
+	assign v_half_dout			= {v_half_carry, v_din[31:1]};
+	assign u_minus_v_dout		= sub32_u_minus_v_difference_out;
+	assign v_minus_u_dout		= sub32_v_minus_u_difference_out;
+	assign u_minus_v_half_dout	= {u_minus_v_half_carry, u_minus_v_din[31:1]};
+	assign v_minus_u_half_dout	= {v_minus_u_half_carry, v_minus_u_din[31:1]};
+	
+	
+		//
+		// Primary Counter Logic: asynchronous active-low reset; once ena starts a pass
+		// the counter free-runs and wraps back to zero, which raises rdy again
+	always @(posedge clk or negedge rst_n)
+		//
+		if (!rst_n)
+			proc_cnt <= proc_cnt_zero;
+		else if (!rdy || ena)
+			proc_cnt <= proc_cnt_next;
+		// (idle with ena low: proc_cnt keeps its value)
+
+
+endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_invert_update.v b/rtl/modular/modular_invertor/helper/modinv_helper_invert_update.v
new file mode 100644
index 0000000..0cd6ac5
--- /dev/null
+++ b/rtl/modular/modular_invertor/helper/modinv_helper_invert_update.v
@@ -0,0 +1,257 @@
+`timescale 1ns / 1ps
+
+module modinv_helper_invert_update
+	(
+		clk, rst_n,
+		ena, rdy,
+		
+		u_gt_v, v_eq_1,
+		u_is_even, v_is_even,
+		
+		r_addr, r_wren, r_dout,
+		s_addr, s_wren, s_dout,
+		u_addr, u_wren, u_dout,
+		v_addr, v_wren, v_dout,
+		
+		r_dbl_addr,          r_dbl_din,
+		s_dbl_addr,          s_dbl_din,
+		r_plus_s_addr,       r_plus_s_din,
+		u_half_addr,         u_half_din,
+		v_half_addr,         v_half_din,
+		u_minus_v_half_addr, u_minus_v_half_din,
+		v_minus_u_half_addr, v_minus_u_half_din
+	);
+	
+	
+		//
+		// Parameters
+		//
+	parameter BUFFER_NUM_WORDS		= 9;
+	parameter BUFFER_ADDR_BITS		= 4;
+	
+	
+		//
+		// clog2 (forward-slash path so the include also resolves on non-Windows hosts)
+		//
+`include "../modinv_clog2.v"
+	
+	
+		//
+		// Constants
+		//
+	localparam PROC_NUM_CYCLES	= BUFFER_NUM_WORDS + 3;
+	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
+	
+	
+		//
+		// Ports
+		//
+	input		wire									clk;
+	input		wire									rst_n;
+	input		wire									ena;
+	output	wire									rdy;
+
+	input		wire									u_gt_v;
+	input		wire									v_eq_1;
+	input		wire									u_is_even;
+	input		wire									v_is_even;
+		
+	output	wire	[BUFFER_ADDR_BITS-1:0]	r_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	s_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	u_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	v_addr;
+		
+	output	wire									r_wren;
+	output	wire									s_wren;
+	output	wire									u_wren;
+	output	wire									v_wren;
+		
+	output	wire	[              32-1:0]	r_dout;
+	output	wire	[              32-1:0]	s_dout;
+	output	wire	[              32-1:0]	u_dout;
+	output	wire	[              32-1:0]	v_dout;
+		
+	output	wire	[BUFFER_ADDR_BITS-1:0]	r_dbl_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	s_dbl_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	r_plus_s_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	u_half_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	v_half_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	u_minus_v_half_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	v_minus_u_half_addr;
+		
+	input		wire	[              32-1:0]	r_dbl_din;
+	input		wire	[              32-1:0]	s_dbl_din;
+	input		wire	[              32-1:0]	r_plus_s_din;
+	input		wire	[              32-1:0]	u_half_din;
+	input		wire	[              32-1:0]	v_half_din;
+	input		wire	[              32-1:0]	u_minus_v_half_din;
+	input		wire	[              32-1:0]	v_minus_u_half_din;
+		
+	
+		//
+		// Counter
+		//
+	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
+
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
+																	proc_cnt + 1'b1 : proc_cnt_zero;
+	
+		//
+		// Addresses
+		//
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_in;
+
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_max		= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_next	= (addr_in < addr_in_max) ?
+																		addr_in + 1'b1 : addr_in_zero;
+																		
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_out;
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_max	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_next	= (addr_out < addr_out_max) ?
+																		addr_out + 1'b1 : addr_out_zero;
+																		
+	assign r_addr					= addr_out;
+	assign s_addr					= addr_out;
+	assign u_addr					= addr_out;
+	assign v_addr					= addr_out;
+	
+	assign r_dbl_addr				= addr_in;
+	assign s_dbl_addr				= addr_in;
+	assign r_plus_s_addr			= addr_in;
+	assign u_half_addr			= addr_in;
+	assign v_half_addr			= addr_in;
+	assign u_minus_v_half_addr	= addr_in;
+	assign v_minus_u_half_addr	= addr_in;
+	
+	
+		//
+		// Ready Flag
+		//
+	assign rdy = (proc_cnt == proc_cnt_zero);
+	
+	
+		//
+		// Address Increment Logic
+		//
+	wire	inc_addr_in;
+	wire	inc_addr_out;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_start	= 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_stop		= BUFFER_NUM_WORDS;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out_start	= 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out_stop	= BUFFER_NUM_WORDS + 1;
+
+	assign inc_addr_in  = (proc_cnt >= cnt_inc_addr_in_start)  && (proc_cnt <= cnt_inc_addr_in_stop);
+	assign inc_addr_out = (proc_cnt >= cnt_inc_addr_out_start) && (proc_cnt <= cnt_inc_addr_out_stop);
+	
+	always @(posedge clk) begin
+		// read pointer (all precalculated inputs): steps up inside its window, word 0 outside it
+		if (inc_addr_in)	addr_in <= addr_in_next;
+		else					addr_in <= addr_in_zero;
+		// write pointer (r/s/u/v): same walk, its window starts and stops one count later
+		if (inc_addr_out)	addr_out <= addr_out_next;
+		else					addr_out <= addr_out_zero;
+		//
+	end
+	
+		//
+		// Write Enable Logic: one shared write window, gated per buffer by its *_wren_allow
+		// flag; no buffer is written while v_eq_1 is set or while the helper is idle
+	wire	wren_out;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out_start	= 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out_stop		= BUFFER_NUM_WORDS + 1;
+
+	assign wren_out = (proc_cnt >= cnt_wren_out_start) && (proc_cnt <= cnt_wren_out_stop);
+
+	reg	r_wren_allow;
+	reg	s_wren_allow;
+	reg	u_wren_allow;
+	reg	v_wren_allow;
+
+	assign r_wren = wren_out && r_wren_allow && !v_eq_1 && !rdy;
+	assign s_wren = wren_out && s_wren_allow && !v_eq_1 && !rdy;
+	assign u_wren = wren_out && u_wren_allow && !v_eq_1 && !rdy;
+	assign v_wren = wren_out && v_wren_allow && !v_eq_1 && !rdy;
+	
+	
+		//
+		// Data Logic
+		//
+	reg	[31: 0]	r_dout_mux;
+	reg	[31: 0]	s_dout_mux;
+	reg	[31: 0]	u_dout_mux;
+	reg	[31: 0]	v_dout_mux;
+	
+	assign r_dout = r_dout_mux;
+	assign s_dout = s_dout_mux;
+	assign u_dout = u_dout_mux;
+	assign v_dout = v_dout_mux;
+	
+	always @(*) begin
+		//
+		// r, s, u, v: choose which precalculated candidate each buffer receives;
+		// every output driven to X below has its *_wren_allow cleared in the same branch
+		if (u_is_even) begin
+			// u even: u <- u_half, s <- s_dbl (v and r are not written)
+			u_dout_mux		= u_half_din;
+			v_dout_mux		= {32{1'bX}};
+			r_dout_mux		= {32{1'bX}};
+			s_dout_mux		= s_dbl_din;
+			//
+			u_wren_allow	= 1'b1;
+			v_wren_allow	= 1'b0;
+			r_wren_allow	= 1'b0;
+			s_wren_allow	= 1'b1;
+			//
+		end else begin
+			//
+			if (v_is_even) begin
+				// u odd, v even: v <- v_half, r <- r_dbl (u and s are not written)
+				u_dout_mux		= {32{1'bX}};
+				v_dout_mux		= v_half_din;
+				r_dout_mux		= r_dbl_din;
+				s_dout_mux		= {32{1'bX}};
+				//
+				u_wren_allow	= 1'b0;
+				v_wren_allow	= 1'b1;
+				r_wren_allow	= 1'b1;
+				s_wren_allow	= 1'b0;
+				//
+			end else begin
+				// u and v both odd: u_gt_v picks u <- u_minus_v_half, else v <- v_minus_u_half; r and s are both written
+				u_dout_mux		=  u_gt_v ? u_minus_v_half_din : {32{1'bX}};
+				v_dout_mux		=  u_gt_v ? {32{1'bX}}         : v_minus_u_half_din;
+				r_dout_mux		=  u_gt_v ? r_plus_s_din       : r_dbl_din;
+				s_dout_mux		=  u_gt_v ? s_dbl_din          : r_plus_s_din;
+				//
+				u_wren_allow	=  u_gt_v;
+				v_wren_allow	= !u_gt_v;
+				r_wren_allow	=  1'b1;
+				s_wren_allow	=  1'b1;
+				//
+			end
+			//
+		end
+		//
+	end
+		
+		
+		//
+		// Primary Counter Logic: asynchronous active-low reset; once ena starts a pass
+		// the counter free-runs and wraps back to zero, which raises rdy again
+	always @(posedge clk or negedge rst_n)
+		//
+		if (!rst_n)
+			proc_cnt <= proc_cnt_zero;
+		else if (!rdy || ena)
+			proc_cnt <= proc_cnt_next;
+		// (idle with ena low: proc_cnt keeps its value)
+
+endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_reduce_precalc.v b/rtl/modular/modular_invertor/helper/modinv_helper_reduce_precalc.v
new file mode 100644
index 0000000..fb858a6
--- /dev/null
+++ b/rtl/modular/modular_invertor/helper/modinv_helper_reduce_precalc.v
@@ -0,0 +1,328 @@
+`timescale 1ns / 1ps
+
+module modinv_helper_reduce_precalc
+	(
+		clk, rst_n,
+		ena, rdy,
+		
+		k,
+		
+		s_is_odd, k_is_nul,
+		
+		r_addr, r_din, r_wren, r_dout,
+		s_addr, s_din,
+		u_addr,        u_wren, u_dout,
+		v_addr,        v_wren, v_dout,
+		q_addr, q_din
+	);
+	
+
+		//
+		// Parameters
+		//
+	parameter OPERAND_NUM_WORDS	= 8;
+	parameter OPERAND_ADDR_BITS	= 3;
+	parameter BUFFER_NUM_WORDS		= 9;
+	parameter BUFFER_ADDR_BITS		= 4;
+	parameter K_NUM_BITS				= 10;
+	
+	
+		//
+		// clog2 (forward-slash path so the include also resolves on non-Windows hosts)
+		//
+`include "../modinv_clog2.v"
+	
+	
+		//
+		// Constants
+		//
+	localparam PROC_NUM_CYCLES	= 2 * BUFFER_NUM_WORDS + 4;
+	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
+	
+	
+		//
+		// Ports
+		//
+	input		wire									clk;
+	input		wire									rst_n;
+	input		wire									ena;
+	output	wire									rdy;
+
+	input		wire	[       K_NUM_BITS-1:0]	k;
+		
+	output	wire									s_is_odd;
+	output	wire									k_is_nul;
+
+	output	wire	[ BUFFER_ADDR_BITS-1:0]	r_addr;
+	output	wire	[ BUFFER_ADDR_BITS-1:0]	s_addr;
+	output	wire	[ BUFFER_ADDR_BITS-1:0]	u_addr;
+	output	wire	[ BUFFER_ADDR_BITS-1:0]	v_addr;
+	output	wire	[OPERAND_ADDR_BITS-1:0]	q_addr;
+
+	input		wire	[              32-1:0]	r_din;
+	input		wire	[              32-1:0]	s_din;
+	input		wire	[              32-1:0]	q_din;
+	
+	output	wire									r_wren;
+	output	wire									u_wren;
+	output	wire									v_wren;
+	
+	output	wire	[              32-1:0]	r_dout;
+	output	wire	[              32-1:0]	u_dout;
+	output	wire	[              32-1:0]	v_dout;
+				
+
+		//
+		// Counter
+		//
+	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
+
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
+																	proc_cnt + 1'b1 : proc_cnt_zero;
+	
+		//
+		// Addresses
+		//
+	reg	[ BUFFER_ADDR_BITS-1:0]	addr_in_buf;
+	reg	[OPERAND_ADDR_BITS-1:0]	addr_in_op;
+	reg	[ BUFFER_ADDR_BITS-1:0]	addr_out1;
+	reg	[ BUFFER_ADDR_BITS-1:0]	addr_out2;
+	reg	[ BUFFER_ADDR_BITS-1:0]	addr_out3;
+
+	wire	[ BUFFER_ADDR_BITS-1:0]	addr_in_buf_last	= BUFFER_NUM_WORDS - 1;
+	wire	[ BUFFER_ADDR_BITS-1:0]	addr_in_buf_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[ BUFFER_ADDR_BITS-1:0]	addr_in_buf_next	= (addr_in_buf < addr_in_buf_last) ?
+																		addr_in_buf + 1'b1 : addr_in_buf_zero;
+	wire	[ BUFFER_ADDR_BITS-1:0]	addr_in_buf_prev	= (addr_in_buf > addr_in_buf_zero) ?
+																		addr_in_buf - 1'b1 : addr_in_buf_zero;
+
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_in_op_last	= OPERAND_NUM_WORDS - 1;
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_in_op_zero	= {OPERAND_ADDR_BITS{1'b0}};
+	wire	[OPERAND_ADDR_BITS-1:0]	addr_in_op_next	= (addr_in_op < addr_in_op_last) ?
+																		addr_in_op + 1'b1 : addr_in_op_zero;
+																		
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_last	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_next	= (addr_out1 < addr_out1_last) ?
+																		addr_out1 + 1'b1 : addr_out1_zero;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_prev	= (addr_out1 > addr_out1_zero) ?
+																		addr_out1 - 1'b1 : addr_out1_zero;
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_last	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_prev	= (addr_out2 > addr_out2_zero) ?
+																		addr_out2 - 1'b1 : addr_out2_last;
+
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_last	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_prev	= (addr_out3 > addr_out3_zero) ?
+																		addr_out3 - 1'b1 : addr_out3_last;
+
+	
+	assign s_addr = addr_in_buf;
+	assign q_addr = addr_in_op;
+	assign r_addr = addr_out1;
+	assign u_addr = addr_out2;
+	assign v_addr = addr_out3;
+	
+	
+		//
+		// Ready Flag
+		//
+	assign rdy = (proc_cnt == proc_cnt_zero);
+		
+		
+		//
+		// Address Increment/Decrement Logic
+		//
+	wire	inc_addr_buf_in;
+	wire	dec_addr_buf_in;
+	wire	inc_addr_op_in;
+	wire	inc_addr_out1;
+	wire	dec_addr_out1;
+	wire	dec_addr_out2;
+	wire	dec_addr_out3;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_calc_flags					= 0 * BUFFER_NUM_WORDS + 2;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_buf_in_start	= 0 * BUFFER_NUM_WORDS + 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_buf_in_stop	= 1 * BUFFER_NUM_WORDS - 1;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_buf_in_start	= 1 * BUFFER_NUM_WORDS + 0;
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_buf_in_stop	= 2 * BUFFER_NUM_WORDS - 2;	
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_op_in_start	= 0 * OPERAND_NUM_WORDS + 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_op_in_stop		= 1 * OPERAND_NUM_WORDS + 0;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out1_start		= 0 * BUFFER_NUM_WORDS + 3;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out1_stop		= 1 * BUFFER_NUM_WORDS + 1;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out1_start		= 1 * BUFFER_NUM_WORDS + 3;
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out1_stop		= 2 * BUFFER_NUM_WORDS + 1;	
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out2_start		= 1 * BUFFER_NUM_WORDS + 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out2_stop		= 2 * BUFFER_NUM_WORDS + 0;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out3_start		= 1 * BUFFER_NUM_WORDS + 4;
+	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out3_stop		= 2 * BUFFER_NUM_WORDS + 3;
+
+	assign inc_addr_buf_in = (proc_cnt >= cnt_inc_addr_buf_in_start) && (proc_cnt <= cnt_inc_addr_buf_in_stop);
+	assign dec_addr_buf_in = (proc_cnt >= cnt_dec_addr_buf_in_start) && (proc_cnt <= cnt_dec_addr_buf_in_stop);
+	assign inc_addr_op_in  = (proc_cnt >= cnt_inc_addr_op_in_start)  && (proc_cnt <= cnt_inc_addr_op_in_stop);
+	assign inc_addr_out1   = (proc_cnt >= cnt_inc_addr_out1_start) && (proc_cnt <= cnt_inc_addr_out1_stop);
+	assign dec_addr_out1   = (proc_cnt >= cnt_dec_addr_out1_start) && (proc_cnt <= cnt_dec_addr_out1_stop);
+	assign dec_addr_out2   = (proc_cnt >= cnt_dec_addr_out2_start) && (proc_cnt <= cnt_dec_addr_out2_stop);
+	assign dec_addr_out3   = (proc_cnt >= cnt_dec_addr_out3_start) && (proc_cnt <= cnt_dec_addr_out3_stop);
+
+	always @(posedge clk) begin
+		// idle (rdy = 1): preset every pointer for the next pass
+		if (rdy) begin
+			// s, q and r pointers start at word 0; u and v pointers start at the last word
+			addr_in_buf		<= addr_in_buf_zero;
+			addr_in_op		<= addr_in_op_zero;
+			addr_out1		<= addr_out1_zero;
+			addr_out2		<= addr_out2_last;
+			addr_out3		<= addr_out3_last;
+			//
+		end else begin
+			// s pointer: up, then back down; holds when neither strobe is set
+			if (inc_addr_buf_in)			addr_in_buf	<= addr_in_buf_next;
+			else if (dec_addr_buf_in)	addr_in_buf	<= addr_in_buf_prev;
+			// q pointer: up inside its window (sized by OPERAND_NUM_WORDS), word 0 outside it
+			if (inc_addr_op_in)			addr_in_op	<= addr_in_op_next;
+			else								addr_in_op	<= addr_in_op_zero;
+			// r pointer: up, then back down; holds when neither strobe is set
+			if (inc_addr_out1)			addr_out1	<= addr_out1_next;
+			else if (dec_addr_out1)		addr_out1	<= addr_out1_prev;
+			// u pointer: down inside its window, reloaded with the last word otherwise
+			if (dec_addr_out2)			addr_out2	<= addr_out2_prev;
+			else								addr_out2	<= addr_out2_last;
+			// v pointer: same descending walk, later window
+			if (dec_addr_out3)			addr_out3	<= addr_out3_prev;
+			else								addr_out3	<= addr_out3_last;
+			//
+		end
+		//
+	end
+	
+	
+		//
+		// Write Enable Logic
+		//
+	wire	wren_out1;
+	wire	wren_out2;
+	wire	wren_out3;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out1_start	= 0 * BUFFER_NUM_WORDS + 3;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out1_stop	= 1 * BUFFER_NUM_WORDS + 2;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out2_start	= 1 * BUFFER_NUM_WORDS + 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out2_stop	= 2 * BUFFER_NUM_WORDS + 0;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out3_start	= 1 * BUFFER_NUM_WORDS + 4;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out3_stop	= 2 * BUFFER_NUM_WORDS + 3;
+
+	assign wren_out1 = (proc_cnt >= cnt_wren_out1_start) && (proc_cnt <= cnt_wren_out1_stop);
+	assign wren_out2 = (proc_cnt >= cnt_wren_out2_start) && (proc_cnt <= cnt_wren_out2_stop);
+	assign wren_out3 = (proc_cnt >= cnt_wren_out3_start) && (proc_cnt <= cnt_wren_out3_stop);
+
+	assign r_wren = wren_out1;
+	assign u_wren = wren_out2;
+	assign v_wren = wren_out3;
+	
+		//
+		// Adder (s + q)
+		// NOTE(review): the instance is named add32_r_plus_s although its operands are s and the masked q - confirm before renaming
+	wire	[31: 0]	q_din_masked;
+	wire	[31: 0]	add32_s_plus_q_sum_out;
+	wire				add32_s_plus_q_carry_in;
+	wire				add32_s_plus_q_carry_out;
+	
+	adder32_wrapper add32_r_plus_s
+	(
+		.clk		(clk),
+		.a			(s_din),
+		.b			(q_din_masked),
+		.s			(add32_s_plus_q_sum_out),
+		.c_in		(add32_s_plus_q_carry_in),
+		.c_out	(add32_s_plus_q_carry_out)
+	);
+		
+		
+		//
+		// Carry Masking Logic
+		// (combinational: mask_carry is 0 only while proc_cnt is in [cnt_wren_out1_start, cnt_wren_out1_stop))
+	wire	mask_carry;
+
+	assign mask_carry = ((proc_cnt >= cnt_wren_out1_start) && (proc_cnt < cnt_wren_out1_stop)) ? 1'b0 : 1'b1;
+
+
+		//
+		// Addend Masking Logic
+		// (q is replaced by zero on the clock after addr_in_buf pointed at the last buffer word)
+	reg	q_din_mask;
+	
+	always @(posedge clk)
+		q_din_mask <= (addr_in_buf == addr_in_buf_last) ? 1'b1 : 1'b0;
+	
+	assign q_din_masked = q_din_mask ? {32{1'b0}} : q_din;
+	
+	assign add32_s_plus_q_carry_in = add32_s_plus_q_carry_out & ~mask_carry;	// carry fed back unless masked
+
+
+		//
+		// Carry Bits: bit 0 of the word seen this clock, registered so that it becomes
+		// bit 31 of the next halved output word; 0 outside the given counter window
+	reg	s_half_carry;
+	reg	s_plus_q_half_carry;
+	
+	always @(posedge clk) begin
+		// feeds u_dout, which is s shifted right by one bit
+		s_half_carry				<= ((proc_cnt >= cnt_wren_out2_start) && (proc_cnt < cnt_wren_out2_stop)) ?
+											s_din[0] : 1'b0;
+		// feeds v_dout, which is r_din shifted right by one bit (r is written with the s + q sum)
+		s_plus_q_half_carry		<= ((proc_cnt >= cnt_wren_out3_start) && (proc_cnt < cnt_wren_out3_stop)) ?
+											r_din[0] : 1'b0;
+		//
+	end
+
+		//
+		// Data Mapper
+		//
+	assign r_dout = add32_s_plus_q_sum_out;
+	assign u_dout = {s_half_carry,        s_din[31:1]};
+	assign v_dout = {s_plus_q_half_carry, r_din[31:1]};
+	
+	
+		//
+		// Primary Counter Logic: asynchronous active-low reset; once ena starts a pass
+		// the counter free-runs and wraps back to zero, which raises rdy again
+	always @(posedge clk or negedge rst_n)
+		//
+		if (!rst_n)
+			proc_cnt <= proc_cnt_zero;
+		else if (!rdy || ena)
+			proc_cnt <= proc_cnt_next;
+		// (idle with ena low: proc_cnt keeps its value)
+		
+		
+		//
+		// Output Flags: sampled once per pass and held in between; the registers have no
+		// reset, so both flags are undefined until the first pass reaches cnt_calc_flags
+	reg	s_is_odd_reg;
+	reg	k_is_nul_reg;
+	
+	assign s_is_odd = s_is_odd_reg;
+	assign k_is_nul = k_is_nul_reg;
+
+	always @(posedge clk)
+		// s_din is presumably the lowest word of s at this count - confirm against the buffer read latency
+		if (proc_cnt == cnt_calc_flags) begin
+			s_is_odd_reg <= s_din[0];
+			k_is_nul_reg <= (k == {K_NUM_BITS{1'b0}}) ? 1'b1 : 1'b0;
+		end
+
+
+endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_reduce_update.v b/rtl/modular/modular_invertor/helper/modinv_helper_reduce_update.v
new file mode 100644
index 0000000..ea5b854
--- /dev/null
+++ b/rtl/modular/modular_invertor/helper/modinv_helper_reduce_update.v
@@ -0,0 +1,153 @@
+`timescale 1ns / 1ps
+
+module modinv_helper_reduce_update	// copies U (or V when s_is_odd) into S, one 32-bit word per clock
+	(
+		clk, rst_n,
+		ena, rdy,
+		
+		s_is_odd, k_is_nul,
+		
+		s_addr, s_wren, s_dout,
+		u_addr,                 u_din,
+		v_addr,                 v_din
+	);
+	
+	
+		//
+		// Parameters
+		// (words per buffer, and the address width used to index them)
+	parameter BUFFER_NUM_WORDS		= 9;
+	parameter BUFFER_ADDR_BITS		= 4;
+	
+	
+		//
+		// clog2
+		// (forward slash: a backslash separator is only understood by tools on Windows hosts)
+`include "../modinv_clog2.v"
+	
+	
+		//
+		// Constants
+		// (proc_cnt idles at 0, then steps through 1 .. PROC_NUM_CYCLES-1 once per operation)
+	localparam PROC_NUM_CYCLES	= BUFFER_NUM_WORDS + 3;
+	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
+	
+	
+		//
+		// Ports
+		// (ena starts an operation while rdy is high; s_* is the write side, u_*/v_* the read side)
+	input		wire									clk;
+	input		wire									rst_n;
+	input		wire									ena;
+	output	wire									rdy;
+
+	input		wire									s_is_odd;
+	input		wire									k_is_nul;
+		
+	output	wire	[BUFFER_ADDR_BITS-1:0]	s_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	u_addr;
+	output	wire	[BUFFER_ADDR_BITS-1:0]	v_addr;
+		
+	output	wire									s_wren;
+		
+	output	wire	[              32-1:0]	s_dout;
+
+	input		wire	[              32-1:0]	u_din;
+	input		wire	[              32-1:0]	v_din;
+		
+	
+		//
+		// Counter
+		// (proc_cnt_next wraps from proc_cnt_max back to zero)
+	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
+
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
+	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
+																	proc_cnt + 1'b1 : proc_cnt_zero;
+	
+		//
+		// Addresses
+		// (addr_in is the shared U/V read address, addr_out the S write address; both wrap after the last word)
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_in;
+
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_max		= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_next	= (addr_in < addr_in_max) ?
+																		addr_in + 1'b1 : addr_in_zero;
+																		
+	reg	[BUFFER_ADDR_BITS-1:0]	addr_out;
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_max	= BUFFER_NUM_WORDS - 1;
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_zero	= {BUFFER_ADDR_BITS{1'b0}};
+	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_next	= (addr_out < addr_out_max) ?
+																		addr_out + 1'b1 : addr_out_zero;
+																		
+	assign s_addr					= addr_out;
+	assign u_addr					= addr_in;
+	assign v_addr					= addr_in;
+	
+	
+		//
+		// Ready Flag
+		// (high only while the counter is parked at zero)
+	assign rdy = (proc_cnt == proc_cnt_zero);
+	
+	
+		//
+		// Address Increment Logic
+		// (the output window opens one count after the input window, so addr_out trails addr_in by one clock)
+	wire	inc_addr_in;
+	wire	inc_addr_out;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_start	= 1;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_stop		= BUFFER_NUM_WORDS;
+	
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out_start	= 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out_stop	= BUFFER_NUM_WORDS + 1;
+
+	assign inc_addr_in  = (proc_cnt >= cnt_inc_addr_in_start)  && (proc_cnt <= cnt_inc_addr_in_stop);
+	assign inc_addr_out = (proc_cnt >= cnt_inc_addr_out_start) && (proc_cnt <= cnt_inc_addr_out_stop);
+	
+	always @(posedge clk) begin
+		// outside its window each address is forced back to zero
+		if (inc_addr_in)	addr_in <= addr_in_next;
+		else					addr_in <= addr_in_zero;
+		//
+		if (inc_addr_out)	addr_out <= addr_out_next;
+		else					addr_out <= addr_out_zero;
+		//
+	end
+	
+		//
+		// Write Enable Logic
+		// (same window as the output address increment: BUFFER_NUM_WORDS consecutive writes)
+	wire	wren_out;
+
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out_start	= 2;
+	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out_stop		= BUFFER_NUM_WORDS + 1;
+
+	assign wren_out = (proc_cnt >= cnt_wren_out_start) && (proc_cnt <= cnt_wren_out_stop);
+
+	assign s_wren = wren_out && !k_is_nul;	// S is left untouched when k_is_nul is set
+	
+	
+		//
+		// Data Logic
+		// (word written back to S: V when s_is_odd is set, U otherwise)
+	assign s_dout = s_is_odd ? v_din : u_din;
+
+		
+		//
+		// Primary Counter Logic
+		// (ena is sampled only while idle; once started the counter runs until it wraps to zero)
+	always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
+		else begin
+			if (!rdy)		proc_cnt <= proc_cnt_next;
+			else if (ena)	proc_cnt <= proc_cnt_next;
+		end
+
+
+endmodule
diff --git a/rtl/modular/modular_invertor/modinv_clog2.v b/rtl/modular/modular_invertor/modinv_clog2.v
new file mode 100644
index 0000000..2f7b64d
--- /dev/null
+++ b/rtl/modular/modular_invertor/modinv_clog2.v
@@ -0,0 +1,10 @@
+function	integer clog2;	// number of bits needed to index `value` items: ceil(log2(value)), 0 for value <= 1
+	input	integer value;
+			integer nbits;
+	begin
+		nbits = 0;
+		value = value - 1;	// largest index that must be representable
+		while (value > 0) begin nbits = nbits + 1; value = value >> 1; end
+		clog2 = nbits;
+	end
+endfunction
diff --git a/rtl/modular/modular_invertor/modular_invertor.v b/rtl/modular/modular_invertor/modular_invertor.v
new file mode 100644
index 0000000..e9f2460
--- /dev/null
+++ b/rtl/modular/modular_invertor/modular_invertor.v
@@ -0,0 +1,981 @@
+//------------------------------------------------------------------------------
+//
+// modular_invertor.v
+// -----------------------------------------------------------------------------
+// Modular invertor.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module modular_invertor	// top level: FSM that sequences the modinv_helper_* modules over shared block-RAM buffers
+	(
+		clk, rst_n,
+		ena, rdy,
+		a_addr, q_addr, a1_addr, a1_wren,
+		a_din, q_din, a1_dout
+	);
+
+
+		//
+		// Parameters
+		// (operand width in bits; it is split into 32-bit words below)
+	parameter MAX_OPERAND_WIDTH = 256;
+	
+	
+		//
+		// clog2
+		// (textual include of the constant function used to size addresses and counters)
+`include "modinv_clog2.v"
+
+
+		//
+		// More Parameters
+		// (buffers hold one word more than an operand; the round counter spans 2 * MAX_OPERAND_WIDTH rounds)
+	localparam OPERAND_NUM_WORDS	= MAX_OPERAND_WIDTH / 32;
+	localparam OPERAND_ADDR_BITS	= clog2(OPERAND_NUM_WORDS);
+	
+	localparam BUFFER_NUM_WORDS	= OPERAND_NUM_WORDS + 1;
+	localparam BUFFER_ADDR_BITS	= clog2(BUFFER_NUM_WORDS);
+	
+	localparam LOOP_NUM_ROUNDS		= 2 * MAX_OPERAND_WIDTH;
+	localparam ROUND_COUNTER_BITS	= clog2(LOOP_NUM_ROUNDS);
+	
+	localparam K_NUM_BITS			= clog2(LOOP_NUM_ROUNDS + 1);	// wide enough to hold the value LOOP_NUM_ROUNDS itself
+	
+
+		//
+		// Ports
+		// (a_*/q_* are word-wide read interfaces, a1_* is the word-wide write interface for the result)
+	input		wire									clk;
+	input		wire									rst_n;
+	
+	input		wire									ena;
+	output	wire									rdy;
+	
+	output	wire	[OPERAND_ADDR_BITS-1:0]	a_addr;
+	output	reg	[OPERAND_ADDR_BITS-1:0]	q_addr;	// reg only because it is driven from a combinational always block
+	output	wire	[OPERAND_ADDR_BITS-1:0]	a1_addr;
+	output	wire									a1_wren;
+	
+	input		wire	[32-1:0]						a_din;
+	input		wire	[32-1:0]						q_din;
+	output	wire	[32-1:0]						a1_dout;
+
+
+		//
+		// "Redundant" Power of 2 (K)
+		// (up/down counter maintained at the bottom of this module; also fed to helper_reduce_precalc)
+	reg	[K_NUM_BITS-1:0]	k;
+
+	
+		//
+		// Buffers
+		// (R, S, U, V: port A is driven by the FSM-controlled selectors further down, port B is read-only)
+	reg	[BUFFER_ADDR_BITS-1:0]	buf_r_wr_addr;
+	reg	[BUFFER_ADDR_BITS-1:0]	buf_r_rd_addr;
+	reg									buf_r_wr_en;
+	reg	[              32-1:0]	buf_r_wr_din;
+	wire	[              32-1:0]	buf_r_wr_dout;	// port-A read data; of these four buffers only R has it connected
+	wire	[              32-1:0]	buf_r_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_r
+	(	.clk(clk),	
+		.a_addr(buf_r_wr_addr), .a_out(buf_r_wr_dout), .a_wr(buf_r_wr_en), .a_in(buf_r_wr_din),
+		.b_addr(buf_r_rd_addr), .b_out(buf_r_rd_dout)
+	);
+	
+	reg	[BUFFER_ADDR_BITS-1:0]	buf_s_wr_addr;
+	reg	[BUFFER_ADDR_BITS-1:0]	buf_s_rd_addr;
+	reg									buf_s_wr_en;
+	reg	[              32-1:0]	buf_s_wr_din;
+	wire	[              32-1:0]	buf_s_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_s
+	(	.clk(clk),	
+		.a_addr(buf_s_wr_addr), .a_out(),              .a_wr(buf_s_wr_en), .a_in(buf_s_wr_din),
+		.b_addr(buf_s_rd_addr), .b_out(buf_s_rd_dout)
+	);
+	
+	reg	[BUFFER_ADDR_BITS-1:0]	buf_u_wr_addr;
+	reg	[BUFFER_ADDR_BITS-1:0]	buf_u_rd_addr;
+	reg									buf_u_wr_en;
+	reg	[              32-1:0]	buf_u_wr_din;
+	wire	[              32-1:0]	buf_u_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_u
+	(	.clk(clk),	
+		.a_addr(buf_u_wr_addr), .a_out(),              .a_wr(buf_u_wr_en), .a_in(buf_u_wr_din),
+		.b_addr(buf_u_rd_addr), .b_out(buf_u_rd_dout)
+	);
+	
+	reg	[BUFFER_ADDR_BITS-1:0]	buf_v_wr_addr;
+	reg	[BUFFER_ADDR_BITS-1:0]	buf_v_rd_addr;
+	reg									buf_v_wr_en;
+	reg	[              32-1:0]	buf_v_wr_din;
+	wire	[              32-1:0]	buf_v_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_v
+	(	.clk(clk),	
+		.a_addr(buf_v_wr_addr), .a_out(),              .a_wr(buf_v_wr_en), .a_in(buf_v_wr_din),
+		.b_addr(buf_v_rd_addr), .b_out(buf_v_rd_dout)
+	);	
+
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_r_dbl_wr_addr;	// buf_r_dbl: port A wired to helper_invert_precalc, port B to helper_invert_update
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_r_dbl_rd_addr;
+	wire									buf_r_dbl_wr_en;
+	wire	[              32-1:0]	buf_r_dbl_wr_din;
+	wire	[              32-1:0]	buf_r_dbl_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_r_dbl
+	(	.clk(clk),	
+		.a_addr(buf_r_dbl_wr_addr), .a_out(),                  .a_wr(buf_r_dbl_wr_en), .a_in(buf_r_dbl_wr_din),
+		.b_addr(buf_r_dbl_rd_addr), .b_out(buf_r_dbl_rd_dout)
+	);
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_s_dbl_wr_addr;	// buf_s_dbl: same wiring pattern as buf_r_dbl
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_s_dbl_rd_addr;
+	wire									buf_s_dbl_wr_en;
+	wire	[              32-1:0]	buf_s_dbl_wr_din;
+	wire	[              32-1:0]	buf_s_dbl_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_s_dbl
+	(	.clk(clk),	
+		.a_addr(buf_s_dbl_wr_addr), .a_out(),                  .a_wr(buf_s_dbl_wr_en), .a_in(buf_s_dbl_wr_din),
+		.b_addr(buf_s_dbl_rd_addr), .b_out(buf_s_dbl_rd_dout)
+	);
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_r_plus_s_wr_addr;	// buf_r_plus_s: same wiring pattern as buf_r_dbl
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_r_plus_s_rd_addr;
+	wire									buf_r_plus_s_wr_en;
+	wire	[              32-1:0]	buf_r_plus_s_wr_din;
+	wire	[              32-1:0]	buf_r_plus_s_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_r_plus_s
+	(	.clk(clk),	
+		.a_addr(buf_r_plus_s_wr_addr), .a_out(),                     .a_wr(buf_r_plus_s_wr_en), .a_in(buf_r_plus_s_wr_din),
+		.b_addr(buf_r_plus_s_rd_addr), .b_out(buf_r_plus_s_rd_dout)
+	);
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_minus_v_wr_addr;	// buf_u_minus_v: only port A is used (write plus read-back by helper_invert_precalc)
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_minus_v_rd_addr;
+	wire									buf_u_minus_v_wr_en;
+	wire	[              32-1:0]	buf_u_minus_v_wr_din;
+	wire	[              32-1:0]	buf_u_minus_v_wr_dout;
+
+	assign buf_u_minus_v_rd_addr = ~buf_u_minus_v_wr_addr;	// port B output is unconnected; presumably complemented so both ports never address the same word - TODO confirm
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_u_minus_v
+	(	.clk(clk),	
+		.a_addr(buf_u_minus_v_wr_addr), .a_out(buf_u_minus_v_wr_dout), .a_wr(buf_u_minus_v_wr_en), .a_in(buf_u_minus_v_wr_din),
+		.b_addr(buf_u_minus_v_rd_addr), .b_out()
+	);
+
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_minus_u_wr_addr;	// buf_v_minus_u: mirror image of buf_u_minus_v
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_minus_u_rd_addr;
+	wire									buf_v_minus_u_wr_en;
+	wire	[              32-1:0]	buf_v_minus_u_wr_din;
+	wire	[              32-1:0]	buf_v_minus_u_wr_dout;
+	
+	assign buf_v_minus_u_rd_addr = ~buf_v_minus_u_wr_addr;	// port B output is unconnected here as well
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_v_minus_u
+	(	.clk(clk),	
+		.a_addr(buf_v_minus_u_wr_addr), .a_out(buf_v_minus_u_wr_dout), .a_wr(buf_v_minus_u_wr_en), .a_in(buf_v_minus_u_wr_din),
+		.b_addr(buf_v_minus_u_rd_addr), .b_out()
+	);
+
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_half_wr_addr;	// buf_u_half: port A wired to helper_invert_precalc, port B to helper_invert_update
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_half_rd_addr;
+	wire									buf_u_half_wr_en;
+	wire	[              32-1:0]	buf_u_half_wr_din;
+	wire	[              32-1:0]	buf_u_half_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_u_half
+	(	.clk(clk),	
+		.a_addr(buf_u_half_wr_addr), .a_out(),                   .a_wr(buf_u_half_wr_en), .a_in(buf_u_half_wr_din),
+		.b_addr(buf_u_half_rd_addr), .b_out(buf_u_half_rd_dout)
+	);
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_half_wr_addr;	// buf_v_half: same wiring pattern as buf_u_half
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_half_rd_addr;
+	wire									buf_v_half_wr_en;
+	wire	[              32-1:0]	buf_v_half_wr_din;
+	wire	[              32-1:0]	buf_v_half_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_v_half
+	(	.clk(clk),	
+		.a_addr(buf_v_half_wr_addr), .a_out(),                   .a_wr(buf_v_half_wr_en), .a_in(buf_v_half_wr_din),
+		.b_addr(buf_v_half_rd_addr), .b_out(buf_v_half_rd_dout)
+	);
+	
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_minus_v_half_wr_addr;	// buf_u_minus_v_half: same wiring pattern as buf_u_half
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_minus_v_half_rd_addr;
+	wire									buf_u_minus_v_half_wr_en;
+	wire	[              32-1:0]	buf_u_minus_v_half_wr_din;
+	wire	[              32-1:0]	buf_u_minus_v_half_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_u_minus_v_half
+	(	.clk(clk),	
+		.a_addr(buf_u_minus_v_half_wr_addr), .a_out(),                           .a_wr(buf_u_minus_v_half_wr_en), .a_in(buf_u_minus_v_half_wr_din),
+		.b_addr(buf_u_minus_v_half_rd_addr), .b_out(buf_u_minus_v_half_rd_dout)
+	);
+
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_minus_u_half_wr_addr;	// buf_v_minus_u_half: same wiring pattern as buf_u_half
+	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_minus_u_half_rd_addr;
+	wire									buf_v_minus_u_half_wr_en;
+	wire	[              32-1:0]	buf_v_minus_u_half_wr_din;
+	wire	[              32-1:0]	buf_v_minus_u_half_rd_dout;
+
+	bram_1rw_1ro_readfirst #
+	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
+	)
+	buf_v_minus_u_half
+	(	.clk(clk),	
+		.a_addr(buf_v_minus_u_half_wr_addr), .a_out(),                           .a_wr(buf_v_minus_u_half_wr_en), .a_in(buf_v_minus_u_half_wr_din),
+		.b_addr(buf_v_minus_u_half_rd_addr), .b_out(buf_v_minus_u_half_rd_dout)
+	);
+
+
+		//
+		// Helper Modules
+		// (one ena/rdy handshake pair per helper; ena is pulsed by the FSM, rdy is driven by the helper)
+	wire helper_init_ena;
+	wire helper_invert_precalc_ena;
+	wire helper_invert_compare_ena;
+	wire helper_invert_update_ena;
+	wire helper_reduce_precalc_ena;
+	wire helper_reduce_update_ena;
+	wire helper_copy_ena;
+	
+	wire helper_init_rdy;
+	wire helper_invert_precalc_rdy;
+	wire helper_invert_compare_rdy;
+	wire helper_invert_update_rdy;
+	wire helper_reduce_precalc_rdy;
+	wire helper_reduce_update_rdy;
+	wire helper_copy_rdy;
+	
+	wire helper_init_done				= helper_init_rdy           && !helper_init_ena;	// done = ready and not being started in this very cycle
+	wire helper_invert_precalc_done	= helper_invert_precalc_rdy && !helper_invert_precalc_ena;
+	wire helper_invert_compare_done	= helper_invert_compare_rdy && !helper_invert_compare_ena;
+	wire helper_invert_update_done	= helper_invert_update_rdy  && !helper_invert_update_ena;
+	wire helper_reduce_precalc_done	= helper_reduce_precalc_rdy && !helper_reduce_precalc_ena;
+	wire helper_reduce_update_done	= helper_reduce_update_rdy  && !helper_reduce_update_ena;
+	wire helper_copy_done				= helper_copy_rdy           && !helper_copy_ena;
+	
+	
+		//
+		// Helper Module - Initialization
+		// (reads the a/q inputs and produces address, data and write-enable for each of R, S, U, V)
+	wire	[ BUFFER_ADDR_BITS-1:0]	helper_init_r_addr;
+	wire	[ BUFFER_ADDR_BITS-1:0]	helper_init_s_addr;
+	wire	[ BUFFER_ADDR_BITS-1:0]	helper_init_u_addr;
+	wire	[ BUFFER_ADDR_BITS-1:0]	helper_init_v_addr;
+	wire	[OPERAND_ADDR_BITS-1:0]	helper_init_q_addr;
+	
+	wire									helper_init_r_wren;
+	wire									helper_init_s_wren;
+	wire									helper_init_u_wren;
+	wire									helper_init_v_wren;
+	
+	wire	[              32-1:0]	helper_init_r_data;
+	wire	[              32-1:0]	helper_init_s_data;
+	wire	[              32-1:0]	helper_init_u_data;
+	wire	[              32-1:0]	helper_init_v_data;
+	
+	modinv_helper_init #
+	(
+		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
+		.OPERAND_ADDR_BITS	(OPERAND_ADDR_BITS),
+	
+		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
+		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
+	)
+	helper_init
+	(
+		.clk 		(clk),
+		.rst_n	(rst_n),
+		
+		.ena 		(helper_init_ena),
+		.rdy 		(helper_init_rdy),
+		
+		.a_addr	(a_addr),	// drives the module's a_addr port directly
+		.q_addr	(helper_init_q_addr),	// reaches the q_addr port through the Q address selector
+		
+		.r_addr	(helper_init_r_addr),
+		.s_addr	(helper_init_s_addr),
+		.u_addr	(helper_init_u_addr),
+		.v_addr	(helper_init_v_addr),
+		
+		.q_din	(q_din),
+		.a_din	(a_din),
+		
+		.r_dout	(helper_init_r_data),
+		.s_dout	(helper_init_s_data),
+		.u_dout	(helper_init_u_data),
+		.v_dout	(helper_init_v_data),
+		
+		.r_wren	(helper_init_r_wren),
+		.s_wren	(helper_init_s_wren),
+		.u_wren	(helper_init_u_wren),
+		.v_wren	(helper_init_v_wren)
+	);
+	
+	
+		//
+		// Helper Module - Inversion Pre-Calculation
+		// (reads R, S, U, V through their read ports and writes the nine intermediate buffers directly)
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_precalc_r_addr;
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_precalc_s_addr;
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_precalc_u_addr;
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_precalc_v_addr;
+	
+	modinv_helper_invert_precalc #
+	(
+		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
+		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
+	)
+	helper_invert_precalc
+	(
+		.clk 							(clk),
+		.rst_n						(rst_n),
+		
+		.ena 							(helper_invert_precalc_ena),
+		.rdy 							(helper_invert_precalc_rdy),
+		
+		.r_addr						(helper_invert_precalc_r_addr),
+		.s_addr						(helper_invert_precalc_s_addr),
+		.u_addr						(helper_invert_precalc_u_addr),
+		.v_addr						(helper_invert_precalc_v_addr),
+		
+		.r_din						(buf_r_rd_dout),
+		.s_din						(buf_s_rd_dout),
+		.u_din						(buf_u_rd_dout),
+		.v_din						(buf_v_rd_dout),
+		
+		.r_dbl_addr					(buf_r_dbl_wr_addr),
+		.s_dbl_addr					(buf_s_dbl_wr_addr),
+		.r_plus_s_addr				(buf_r_plus_s_wr_addr),
+		
+		.u_half_addr				(buf_u_half_wr_addr),
+		.v_half_addr				(buf_v_half_wr_addr),
+		.u_minus_v_addr			(buf_u_minus_v_wr_addr),
+		.v_minus_u_addr			(buf_v_minus_u_wr_addr),
+		.u_minus_v_half_addr		(buf_u_minus_v_half_wr_addr),
+		.v_minus_u_half_addr		(buf_v_minus_u_half_wr_addr),
+		
+		.r_dbl_dout					(buf_r_dbl_wr_din),
+		.s_dbl_dout					(buf_s_dbl_wr_din),
+		.r_plus_s_dout				(buf_r_plus_s_wr_din),
+		
+		.u_half_dout				(buf_u_half_wr_din),
+		.v_half_dout				(buf_v_half_wr_din),
+		.u_minus_v_dout			(buf_u_minus_v_wr_din),
+		.v_minus_u_dout			(buf_v_minus_u_wr_din),
+		.u_minus_v_half_dout		(buf_u_minus_v_half_wr_din),
+		.v_minus_u_half_dout		(buf_v_minus_u_half_wr_din),
+		
+		.r_dbl_wren					(buf_r_dbl_wr_en),
+		.s_dbl_wren					(buf_s_dbl_wr_en),
+		.r_plus_s_wren				(buf_r_plus_s_wr_en),
+		
+		.u_half_wren				(buf_u_half_wr_en),
+		.v_half_wren				(buf_v_half_wr_en),
+		.u_minus_v_wren			(buf_u_minus_v_wr_en),
+		.v_minus_u_wren			(buf_v_minus_u_wr_en),
+		.u_minus_v_half_wren		(buf_u_minus_v_half_wr_en),
+		.v_minus_u_half_wren		(buf_v_minus_u_half_wr_en),
+		
+		.u_minus_v_din				(buf_u_minus_v_wr_dout),	// differences are read back through port A of their own buffers
+		.v_minus_u_din				(buf_v_minus_u_wr_dout)
+	);
+	
+	
+		//
+		// Helper Module - Inversion Comparison
+		// (reads U and V and produces the four flags consumed by helper_invert_update)
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_compare_u_addr;
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_compare_v_addr;
+
+	wire	flag_invert_u_gt_v;
+	wire	flag_invert_v_eq_1;	// also gates the K increment at the bottom of this module
+	wire	flag_invert_u_is_even;
+	wire	flag_invert_v_is_even;
+
+	modinv_helper_invert_compare #
+	(
+		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
+		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
+	)
+	helper_invert_compare
+	(
+		.clk 			(clk),
+		.rst_n		(rst_n),
+		
+		.ena 			(helper_invert_compare_ena),
+		.rdy 			(helper_invert_compare_rdy),
+				
+		.u_addr		(helper_invert_compare_u_addr),
+		.v_addr		(helper_invert_compare_v_addr),
+		
+		.u_din		(buf_u_rd_dout),
+		.v_din		(buf_v_rd_dout),
+		
+		.u_gt_v		(flag_invert_u_gt_v),
+		.v_eq_1		(flag_invert_v_eq_1),
+		.u_is_even	(flag_invert_u_is_even),
+		.v_is_even	(flag_invert_v_is_even)
+	);
+	
+		
+		//
+		// Helper Module - Inversion Update
+		// (takes the comparison flags, reads the intermediate buffers and writes R, S, U, V via the selectors)
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_update_r_addr;
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_update_s_addr;
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_update_u_addr;
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_update_v_addr;
+	
+	wire									helper_invert_update_r_wren;
+	wire									helper_invert_update_s_wren;
+	wire									helper_invert_update_u_wren;
+	wire									helper_invert_update_v_wren;
+	
+	wire	[              32-1:0]	helper_invert_update_r_data;
+	wire	[              32-1:0]	helper_invert_update_s_data;
+	wire	[              32-1:0]	helper_invert_update_u_data;
+	wire	[              32-1:0]	helper_invert_update_v_data;
+	
+	modinv_helper_invert_update #
+	(
+		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
+		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
+	)
+	helper_invert_update
+	(
+		.clk 							(clk),
+		.rst_n						(rst_n),
+		
+		.ena 							(helper_invert_update_ena),
+		.rdy 							(helper_invert_update_rdy),
+		
+		.u_gt_v						(flag_invert_u_gt_v),
+		.v_eq_1						(flag_invert_v_eq_1),
+		.u_is_even					(flag_invert_u_is_even),
+		.v_is_even					(flag_invert_v_is_even),
+		
+		.r_addr						(helper_invert_update_r_addr),
+		.s_addr						(helper_invert_update_s_addr),
+		.u_addr						(helper_invert_update_u_addr),
+		.v_addr						(helper_invert_update_v_addr),
+		
+		.r_wren						(helper_invert_update_r_wren),
+		.s_wren						(helper_invert_update_s_wren),
+		.u_wren						(helper_invert_update_u_wren),
+		.v_wren						(helper_invert_update_v_wren),
+		
+		.r_dout						(helper_invert_update_r_data),
+		.s_dout						(helper_invert_update_s_data),
+		.u_dout						(helper_invert_update_u_data),
+		.v_dout						(helper_invert_update_v_data),
+		
+		.r_dbl_addr					(buf_r_dbl_rd_addr),	// read ports of the intermediate buffers are wired straight to this helper
+		.s_dbl_addr					(buf_s_dbl_rd_addr),
+		.r_plus_s_addr				(buf_r_plus_s_rd_addr),
+		.u_half_addr				(buf_u_half_rd_addr),
+		.v_half_addr				(buf_v_half_rd_addr),
+		.u_minus_v_half_addr		(buf_u_minus_v_half_rd_addr),
+		.v_minus_u_half_addr		(buf_v_minus_u_half_rd_addr),
+		
+		.r_dbl_din					(buf_r_dbl_rd_dout),
+		.s_dbl_din					(buf_s_dbl_rd_dout),
+		.r_plus_s_din				(buf_r_plus_s_rd_dout),
+		.u_half_din					(buf_u_half_rd_dout),
+		.v_half_din					(buf_v_half_rd_dout),
+		.u_minus_v_half_din		(buf_u_minus_v_half_rd_dout),
+		.v_minus_u_half_din		(buf_v_minus_u_half_rd_dout)
+	);
+	
+	
+		//
+		// Helper Module - Reduction Pre-Calculation
+		// (reads R through port A read-back, S through its read port and Q; writes R, U, V via the selectors)
+	wire	[ BUFFER_ADDR_BITS-1:0]	helper_reduce_precalc_r_addr;
+	wire	[ BUFFER_ADDR_BITS-1:0]	helper_reduce_precalc_s_addr;
+	wire	[ BUFFER_ADDR_BITS-1:0]	helper_reduce_precalc_u_addr;
+	wire	[ BUFFER_ADDR_BITS-1:0]	helper_reduce_precalc_v_addr;
+	wire	[OPERAND_ADDR_BITS-1:0]	helper_reduce_precalc_q_addr;
+	
+	wire									helper_reduce_precalc_r_wren;
+	wire									helper_reduce_precalc_u_wren;
+	wire									helper_reduce_precalc_v_wren;
+	
+	wire	[              32-1:0]	helper_reduce_precalc_r_data;
+	wire	[              32-1:0]	helper_reduce_precalc_u_data;
+	wire	[              32-1:0]	helper_reduce_precalc_v_data;
+
+	wire	flag_reduce_s_is_odd;	// both flags are produced by helper_reduce_precalc and consumed by helper_reduce_update
+	wire	flag_reduce_k_is_nul;	// explicit declaration of the net connected to both .k_is_nul ports
+	
+	modinv_helper_reduce_precalc #
+	(
+		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
+		.OPERAND_ADDR_BITS	(OPERAND_ADDR_BITS),
+		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
+		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS),
+		.K_NUM_BITS				(K_NUM_BITS)
+	)
+	helper_reduce_precalc
+	(
+		.clk 			(clk),
+		.rst_n		(rst_n),
+		
+		.ena 			(helper_reduce_precalc_ena),
+		.rdy 			(helper_reduce_precalc_rdy),
+		
+		.r_addr		(helper_reduce_precalc_r_addr),
+		.s_addr		(helper_reduce_precalc_s_addr),
+		.u_addr		(helper_reduce_precalc_u_addr),
+		.v_addr		(helper_reduce_precalc_v_addr),
+		.q_addr		(helper_reduce_precalc_q_addr),
+		
+		.k				(k),
+		
+		.s_is_odd	(flag_reduce_s_is_odd),
+		.k_is_nul	(flag_reduce_k_is_nul),
+		
+		.r_din		(buf_r_wr_dout),	// R is read back through port A, the port this helper also writes
+		.s_din		(buf_s_rd_dout),
+		.q_din		(q_din),
+		
+		.r_wren		(helper_reduce_precalc_r_wren),
+		.u_wren		(helper_reduce_precalc_u_wren),
+		.v_wren		(helper_reduce_precalc_v_wren),
+		
+		.r_dout		(helper_reduce_precalc_r_data),
+		.u_dout		(helper_reduce_precalc_u_data),
+		.v_dout		(helper_reduce_precalc_v_data)
+	);
+	
+		//
+		// Helper Module - Reduction Update
+		// (reads U and V through their read ports and writes S via the selectors)
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_reduce_update_s_addr;
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_reduce_update_u_addr;
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_reduce_update_v_addr;
+	
+	wire									helper_reduce_update_s_wren;
+	
+	wire	[              32-1:0]	helper_reduce_update_s_data;
+	
+	modinv_helper_reduce_update #
+	(
+		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
+		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
+	)
+	helper_reduce_update
+	(
+		.clk 							(clk),
+		.rst_n						(rst_n),
+		
+		.ena 							(helper_reduce_update_ena),
+		.rdy 							(helper_reduce_update_rdy),
+		
+		.s_is_odd					(flag_reduce_s_is_odd),	// selects V instead of U as the source word
+		.k_is_nul					(flag_reduce_k_is_nul),	// suppresses the write to S
+		
+		.s_addr						(helper_reduce_update_s_addr),
+		.u_addr						(helper_reduce_update_u_addr),
+		.v_addr						(helper_reduce_update_v_addr),
+		
+		.s_wren						(helper_reduce_update_s_wren),
+		
+		.s_dout						(helper_reduce_update_s_data),
+				
+		.u_din						(buf_u_rd_dout),
+		.v_din						(buf_v_rd_dout)
+	);
+	
+	
+		//
+		// Helper Module - Copying
+		// (reads S through its read port and drives the a1_* output interface of this module directly)
+	wire	[BUFFER_ADDR_BITS-1:0]	helper_copy_s_addr;
+		
+	modinv_helper_copy #
+	(
+		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
+		.OPERAND_ADDR_BITS	(OPERAND_ADDR_BITS),
+	
+		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
+		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
+	)
+	helper_copy
+	(
+		.clk 		(clk),
+		.rst_n	(rst_n),
+		
+		.ena 		(helper_copy_ena),
+		.rdy 		(helper_copy_rdy),
+		
+		.s_addr	(helper_copy_s_addr),
+		.a1_addr	(a1_addr),
+		
+		.s_din	(buf_s_rd_dout),
+		
+		.a1_dout	(a1_dout),
+		
+		.a1_wren	(a1_wren)
+	);
+	
+	
+		//
+		// Round Counter
+		// (counts 0 .. LOOP_NUM_ROUNDS-1; round_counter_next wraps from the maximum back to zero)
+	reg	[ROUND_COUNTER_BITS-1:0]	round_counter;
+	wire	[ROUND_COUNTER_BITS-1:0]	round_counter_max = LOOP_NUM_ROUNDS - 1;
+	wire	[ROUND_COUNTER_BITS-1:0]	round_counter_zero = {ROUND_COUNTER_BITS{1'b0}};
+	wire	[ROUND_COUNTER_BITS-1:0]	round_counter_next =
+		(round_counter < round_counter_max) ? round_counter + 1'b1 : round_counter_zero;
+
+	
+		//
+		// FSM
+		// (IDLE -> INIT -> inversion loop -> reduction loop -> COPY -> DONE -> IDLE)
+	localparam FSM_STATE_IDLE				= 4'd0;
+	
+	localparam FSM_STATE_INIT				= 4'd1;
+	
+	localparam FSM_STATE_INVERT_PRECALC	= 4'd11;
+	localparam FSM_STATE_INVERT_COMPARE	= 4'd12;
+	localparam FSM_STATE_INVERT_UPDATE	= 4'd13;
+	
+	localparam FSM_STATE_REDUCE_PRECALC	= 4'd14;
+	localparam FSM_STATE_REDUCE_UPDATE	= 4'd15;
+	
+	localparam FSM_STATE_COPY				= 4'd2;
+	
+	localparam FSM_STATE_DONE				= 4'd3;
+	
+	reg [3:0] fsm_state = FSM_STATE_IDLE;
+	reg [3:0] fsm_state_dly = FSM_STATE_IDLE;	// fsm_state delayed by one clock
+	
+	wire fsm_state_new = (fsm_state != fsm_state_dly);	// high during the first clock spent in a newly entered state
+
+	wire [3:0] fsm_state_invert_next = (round_counter < round_counter_max) ?	// repeat the inversion loop until the last round
+		FSM_STATE_INVERT_PRECALC : FSM_STATE_REDUCE_PRECALC;
+		
+	wire [3:0] fsm_state_reduce_next = (round_counter < round_counter_max) ?	// repeat the reduction loop until the last round
+		FSM_STATE_REDUCE_PRECALC : FSM_STATE_COPY;
+	
+	always @(posedge clk or negedge rst_n)
+		// each working state waits for the *_done flag of its own helper before moving on
+		if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
+		else case (fsm_state)
+			FSM_STATE_IDLE:				fsm_state <= ena                        ? FSM_STATE_INIT           : FSM_STATE_IDLE;
+			FSM_STATE_INIT:				fsm_state <= helper_init_done           ? FSM_STATE_INVERT_PRECALC : FSM_STATE_INIT;
+			FSM_STATE_INVERT_PRECALC:	fsm_state <= helper_invert_precalc_done ? FSM_STATE_INVERT_COMPARE : FSM_STATE_INVERT_PRECALC;
+			FSM_STATE_INVERT_COMPARE:	fsm_state <= helper_invert_compare_done ? FSM_STATE_INVERT_UPDATE  : FSM_STATE_INVERT_COMPARE;
+			FSM_STATE_INVERT_UPDATE:	fsm_state <= helper_invert_update_done  ? fsm_state_invert_next    : FSM_STATE_INVERT_UPDATE;
+			FSM_STATE_REDUCE_PRECALC:	fsm_state <= helper_reduce_precalc_done ? FSM_STATE_REDUCE_UPDATE  : FSM_STATE_REDUCE_PRECALC;
+			FSM_STATE_REDUCE_UPDATE:	fsm_state <= helper_reduce_update_done  ? fsm_state_reduce_next    : FSM_STATE_REDUCE_UPDATE;
+			FSM_STATE_COPY:				fsm_state <= helper_copy_done           ? FSM_STATE_DONE           : FSM_STATE_COPY;
+			FSM_STATE_DONE:				fsm_state <= FSM_STATE_IDLE;
+			default:							fsm_state <= FSM_STATE_IDLE;
+		endcase
+		
+	always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0)	fsm_state_dly <= FSM_STATE_IDLE;
+		else						fsm_state_dly <= fsm_state;
+
+
+	assign helper_init_ena				= (fsm_state == FSM_STATE_INIT)           && fsm_state_new;	// one-clock start pulse on state entry
+	assign helper_invert_precalc_ena	= (fsm_state == FSM_STATE_INVERT_PRECALC) && fsm_state_new;
+	assign helper_invert_compare_ena	= (fsm_state == FSM_STATE_INVERT_COMPARE) && fsm_state_new;
+	assign helper_invert_update_ena	= (fsm_state == FSM_STATE_INVERT_UPDATE)  && fsm_state_new;
+	assign helper_reduce_precalc_ena	= (fsm_state == FSM_STATE_REDUCE_PRECALC) && fsm_state_new;
+	assign helper_reduce_update_ena	= (fsm_state == FSM_STATE_REDUCE_UPDATE)  && fsm_state_new;
+	assign helper_copy_ena				= (fsm_state == FSM_STATE_COPY)           && fsm_state_new;
+	
+	
+		//
+		// Counter Increment
+		// (cleared when initialization completes, advanced once per completed update pass)
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:
+				if (helper_init_done)				round_counter <= round_counter_zero;
+			FSM_STATE_INVERT_UPDATE:
+				if (helper_invert_update_done)	round_counter <= round_counter_next;
+			FSM_STATE_REDUCE_UPDATE:
+				if (helper_reduce_update_done)	round_counter <= round_counter_next;
+			// every other state leaves round_counter unchanged
+			default: ;
+		endcase
+		
+		
+		//
+		// Q Address Selector
+		// (only helper_init and helper_reduce_precalc read Q; the address is a don't-care in every other state)
+	always @(*) begin
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				q_addr = helper_init_q_addr;
+			FSM_STATE_REDUCE_PRECALC:	q_addr = helper_reduce_precalc_q_addr;
+			default:							q_addr = {OPERAND_ADDR_BITS{1'bX}};
+		endcase
+		//
+	end
+	
+	
+		//
+		// Buffer Address Selector
+		// (routes the address of the helper that owns each port in the current state; X = don't-care otherwise)
+	always @(*) begin
+		//
+		// Write Ports
+		// (S is written by helper_reduce_update, whereas R, U, V are written by helper_reduce_precalc)
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_r_wr_addr = helper_init_r_addr;
+			FSM_STATE_INVERT_UPDATE:	buf_r_wr_addr = helper_invert_update_r_addr;
+			FSM_STATE_REDUCE_PRECALC:	buf_r_wr_addr = helper_reduce_precalc_r_addr;
+			default:							buf_r_wr_addr = {BUFFER_ADDR_BITS{1'bX}};
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_s_wr_addr = helper_init_s_addr;
+			FSM_STATE_INVERT_UPDATE:	buf_s_wr_addr = helper_invert_update_s_addr;
+			FSM_STATE_REDUCE_UPDATE:	buf_s_wr_addr = helper_reduce_update_s_addr;
+			default:							buf_s_wr_addr = {BUFFER_ADDR_BITS{1'bX}};
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_u_wr_addr = helper_init_u_addr;
+			FSM_STATE_INVERT_UPDATE:	buf_u_wr_addr = helper_invert_update_u_addr;
+			FSM_STATE_REDUCE_PRECALC:	buf_u_wr_addr = helper_reduce_precalc_u_addr;
+			default:							buf_u_wr_addr = {BUFFER_ADDR_BITS{1'bX}};
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_v_wr_addr = helper_init_v_addr;
+			FSM_STATE_INVERT_UPDATE:	buf_v_wr_addr = helper_invert_update_v_addr;
+			FSM_STATE_REDUCE_PRECALC:	buf_v_wr_addr = helper_reduce_precalc_v_addr;
+			default:							buf_v_wr_addr = {BUFFER_ADDR_BITS{1'bX}};
+		endcase
+		//
+		// Read Ports
+		// (the read port of R is used by helper_invert_precalc only)
+		case (fsm_state)
+			FSM_STATE_INVERT_PRECALC:	buf_r_rd_addr = helper_invert_precalc_r_addr;
+			default:							buf_r_rd_addr = {BUFFER_ADDR_BITS{1'bX}};
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INVERT_PRECALC:	buf_s_rd_addr = helper_invert_precalc_s_addr;
+			FSM_STATE_REDUCE_PRECALC:	buf_s_rd_addr = helper_reduce_precalc_s_addr;
+			FSM_STATE_COPY:				buf_s_rd_addr = helper_copy_s_addr;
+			default:							buf_s_rd_addr = {BUFFER_ADDR_BITS{1'bX}};
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INVERT_PRECALC:	buf_u_rd_addr = helper_invert_precalc_u_addr;
+			FSM_STATE_INVERT_COMPARE:	buf_u_rd_addr = helper_invert_compare_u_addr;
+			FSM_STATE_REDUCE_UPDATE:	buf_u_rd_addr = helper_reduce_update_u_addr;
+			default:							buf_u_rd_addr = {BUFFER_ADDR_BITS{1'bX}};
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INVERT_PRECALC:	buf_v_rd_addr = helper_invert_precalc_v_addr;
+			FSM_STATE_INVERT_COMPARE:	buf_v_rd_addr = helper_invert_compare_v_addr;
+			FSM_STATE_REDUCE_UPDATE:	buf_v_rd_addr = helper_reduce_update_v_addr;
+			default:							buf_v_rd_addr = {BUFFER_ADDR_BITS{1'bX}};
+		endcase
+		//
+	end
+	
+	
+		//
+		// Buffer Write Enable Logic
+		// (unlike the address and data selectors, the default here is 0 so idle states can never write)
+	always @(*) begin
+		//
+		// Write Ports
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_r_wr_en  = helper_init_r_wren;
+			FSM_STATE_INVERT_UPDATE:	buf_r_wr_en = helper_invert_update_r_wren;
+			FSM_STATE_REDUCE_PRECALC:	buf_r_wr_en = helper_reduce_precalc_r_wren;
+			default:							buf_r_wr_en = 1'b0;
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_s_wr_en = helper_init_s_wren;
+			FSM_STATE_INVERT_UPDATE:	buf_s_wr_en = helper_invert_update_s_wren;
+			FSM_STATE_REDUCE_UPDATE:	buf_s_wr_en = helper_reduce_update_s_wren;
+			default:							buf_s_wr_en = 1'b0;
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_u_wr_en = helper_init_u_wren;
+			FSM_STATE_INVERT_UPDATE:	buf_u_wr_en = helper_invert_update_u_wren;
+			FSM_STATE_REDUCE_PRECALC:	buf_u_wr_en = helper_reduce_precalc_u_wren;
+			default:							buf_u_wr_en = 1'b0;
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_v_wr_en = helper_init_v_wren;
+			FSM_STATE_INVERT_UPDATE:	buf_v_wr_en = helper_invert_update_v_wren;
+			FSM_STATE_REDUCE_PRECALC:	buf_v_wr_en = helper_reduce_precalc_v_wren;
+			default:							buf_v_wr_en = 1'b0;
+		endcase
+		//
+	end
+	
+	
+		//
+		// Buffer Write Data Selector
+		// (same state-to-helper mapping as the write address selector; X = don't-care while write enable is 0)
+	always @(*) begin
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_r_wr_din = helper_init_r_data;
+			FSM_STATE_INVERT_UPDATE:	buf_r_wr_din = helper_invert_update_r_data;
+			FSM_STATE_REDUCE_PRECALC:	buf_r_wr_din = helper_reduce_precalc_r_data;
+			default:							buf_r_wr_din = {32{1'bX}};
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_s_wr_din = helper_init_s_data;
+			FSM_STATE_INVERT_UPDATE:	buf_s_wr_din = helper_invert_update_s_data;
+			FSM_STATE_REDUCE_UPDATE:	buf_s_wr_din = helper_reduce_update_s_data;
+			default:							buf_s_wr_din = {32{1'bX}};
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_u_wr_din = helper_init_u_data;
+			FSM_STATE_INVERT_UPDATE:	buf_u_wr_din = helper_invert_update_u_data;
+			FSM_STATE_REDUCE_PRECALC:	buf_u_wr_din = helper_reduce_precalc_u_data;
+			default:							buf_u_wr_din = {32{1'bX}};
+		endcase
+		//
+		case (fsm_state)
+			FSM_STATE_INIT:				buf_v_wr_din = helper_init_v_data;
+			FSM_STATE_INVERT_UPDATE:	buf_v_wr_din = helper_invert_update_v_data;
+			FSM_STATE_REDUCE_PRECALC:	buf_v_wr_din = helper_reduce_precalc_v_data;
+			default:							buf_v_wr_din = {32{1'bX}};
+		endcase
+		//
+	end
+	
+	
+		//
+		// Ready Logic
+		// (rdy falls when a start request is accepted and rises again in FSM_STATE_DONE)
+	reg rdy_reg = 1'b1;
+
+	assign rdy = rdy_reg;
+	
+	always @(posedge clk or negedge rst_n)
+		//
+		if (rst_n == 1'b0)
+			rdy_reg <= 1'b1;
+		else begin
+				/* set: the operation has finished */
+			if (!rdy && (fsm_state == FSM_STATE_DONE))
+				rdy_reg <= 1'b1;
+				/* clear: a new operation is accepted while idle */
+			else if (rdy && ena)
+				rdy_reg <= 1'b0;
+		end
+	
+		
+		//
+		// Store Redundant Power of 2 (K)
+		// (zeroed at init; +1 per inversion update while flag_invert_v_eq_1 is low; -1 per reduction update down to 0)
+	always @(posedge clk)
+		//
+		if (helper_init_ena)
+			k <= {K_NUM_BITS{1'b0}};
+		else begin
+		
+				/* reduction phase: count back down, stopping at zero */
+			if (helper_reduce_update_ena && (k != {K_NUM_BITS{1'b0}}))
+				k <= k - 1'b1;
+				/* inversion phase: count the update passes started with v_eq_1 still low */
+			else if (helper_invert_update_ena && !flag_invert_v_eq_1)
+				k <= k + 1'b1;
+		end
+	
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/modular/modular_multiplier_256.v b/rtl/modular/modular_multiplier_256.v
new file mode 100644
index 0000000..8487aee
--- /dev/null
+++ b/rtl/modular/modular_multiplier_256.v
@@ -0,0 +1,402 @@
+//------------------------------------------------------------------------------
+//
+// modular_multiplier_256.v
+// -----------------------------------------------------------------------------
+// Modular multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module modular_multiplier_256
+	(
+		clk, rst_n,
+		ena, rdy,
+		a_addr, b_addr, n_addr, p_addr, p_wren,
+		a_din, b_din, n_din, p_dout
+	);
+	
+	
+		//
+		// Constants
+		//
+	localparam	OPERAND_NUM_WORDS					= 8;
+	localparam	WORD_COUNTER_WIDTH				= 3;
+	
+	
+		//
+		// Handy Numbers
+		//
+	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_ZERO	= 0;
+	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_LAST	= OPERAND_NUM_WORDS - 1;
+	
+	
+		//
+		// Handy Functions
+		//
+	// Next word index, wrapping from WORD_INDEX_LAST back to WORD_INDEX_ZERO.
+	function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_NEXT_OR_ZERO;
+		input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
+		begin
+			WORD_INDEX_NEXT_OR_ZERO = (WORD_INDEX_CURRENT >= WORD_INDEX_LAST) ? WORD_INDEX_ZERO : (WORD_INDEX_CURRENT + 1'b1);
+		end
+	endfunction
+	
+	// Previous word index, wrapping from WORD_INDEX_ZERO to WORD_INDEX_LAST (the index is unsigned, so "<= zero" is the at-zero case).
+	function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_PREVIOUS_OR_LAST;
+		input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
+		begin
+			WORD_INDEX_PREVIOUS_OR_LAST = (WORD_INDEX_CURRENT <= WORD_INDEX_ZERO) ? WORD_INDEX_LAST : (WORD_INDEX_CURRENT - 1'b1);
+		end
+	endfunction
+	
+	
+		//
+		// Ports
+		//
+	input		wire										clk;		// system clock
+	input		wire										rst_n;	// active-low async reset
+	
+	input		wire										ena;		// enable input
+	output	wire										rdy;		// ready output
+	
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	a_addr;	// index of current A word
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	b_addr;	// index of current B word
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	n_addr;	// index of current N word
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	p_addr;	// index of current P word
+	output	wire										p_wren;	// store current P word now	
+	
+	input		wire	[                  31:0]	a_din;	// A
+	input		wire	[                  31:0]	b_din;	// B
+	input		wire	[                  31:0]	n_din;	// N (must be P-256!)
+	output	wire	[                  31:0]	p_dout;	// P = A * B mod N
+	
+	
+		//
+		// Word Indices
+		//
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_a;
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_b;
+		
+		/* map registers to output ports */
+	assign a_addr	= index_a;
+	assign b_addr	= index_b;
+	
+		//
+		// FSM
+		//
+		// One-hot shift register: a single set bit moves from the MSB down to bit 0, and every control strobe is the OR of a window of consecutive bits.
+	localparam	FSM_SHREG_WIDTH	= (1 * OPERAND_NUM_WORDS + 1) + (2 * OPERAND_NUM_WORDS + 1) + (2 * OPERAND_NUM_WORDS + 2) + (0 * OPERAND_NUM_WORDS + 2) + 1;
+	reg	[FSM_SHREG_WIDTH-1:0]	fsm_shreg;
+	assign rdy = fsm_shreg[0];	// bit 0 is the idle/ready position
+	
+		// Phase windows: every part-select is exactly as wide as the wire it drives, so no bit is dropped by implicit truncation (inc_index_b is 16 bits, store_si_msb is 15 bits).
+	wire [1 * OPERAND_NUM_WORDS-1:0]	fsm_shreg_inc_index_a	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 0)];
+	wire [1 * OPERAND_NUM_WORDS-1:0]	fsm_shreg_store_word_a	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 1)];
+	wire [2 * OPERAND_NUM_WORDS-1:0]	fsm_shreg_inc_index_b	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 1)];
+	wire [2 * OPERAND_NUM_WORDS-2:0]	fsm_shreg_store_si_msb	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 3) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 1)];
+	wire [0 * OPERAND_NUM_WORDS-0:0] fsm_shreg_store_si_lsb	= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 2)];
+	wire [2 * OPERAND_NUM_WORDS-2:0]	fsm_shreg_shift_si		= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 3) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 1)];
+	wire [0 * OPERAND_NUM_WORDS-0:0]	fsm_shreg_mask_cw1_sum	= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 4)];
+	wire [2 * OPERAND_NUM_WORDS-1:0]	fsm_shreg_store_c_word	= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 5) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 4)];
+	wire [0 * OPERAND_NUM_WORDS-0:0]	fsm_shreg_reduce_start	= fsm_shreg[FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 5) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 5)];
+	wire [0 * OPERAND_NUM_WORDS-0:0]	fsm_shreg_reduce_stop	= fsm_shreg[FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 6) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 6)];
+	
+	wire inc_index_a		= |fsm_shreg_inc_index_a;		// step the A word address
+	wire store_word_a		= |fsm_shreg_store_word_a;		// latch the A word into the wide buffer
+	wire inc_index_b		= |fsm_shreg_inc_index_b;		// multiply window: step the B half-word selection
+	wire clear_mac_ab		= |fsm_shreg_inc_index_b;		// multiply window: run the per-accumulator clear sequence
+	wire shift_wide_a		= |fsm_shreg_inc_index_b;		// multiply window: rotate the wide A buffer
+	wire enable_mac_ab	= |fsm_shreg_inc_index_b;		// multiply window: clock-enable the accumulators
+	wire store_si_msb		= |fsm_shreg_store_si_msb;		// capture the upper intermediate words
+	wire store_si_lsb		=  fsm_shreg_store_si_lsb;		// capture the lower intermediate words
+	wire shift_si			= |fsm_shreg_shift_si;			// shift the intermediate words out
+	wire mask_cw1_sum		=  fsm_shreg_mask_cw1_sum;		// suppress the carry feedback of the second adder
+	wire store_c_word		= |fsm_shreg_store_c_word;		// write a product word to the product buffer
+	wire reduce_start		=  fsm_shreg_reduce_start;		// kick off the reduction stage
+	wire reduce_stop		=  fsm_shreg_reduce_stop;		// FSM waits here for the reduction stage
+	
+	
+		//
+		// FSM Logic
+		//
+	wire	reduce_done;	// ready flag of the reduction stage (rdy output of reduce_256_inst)
+	
+	// FSM register update:
+	//   reset - idle state, only bit 0 (rdy) set
+	//   idle  - remain idle while ena is low; when ena is high place the token in the MSB
+	//   busy  - move the token one bit towards bit 0 per clock, but hold position while
+	//           reduce_stop is active until reduce_done is high
+	always @(posedge clk or negedge rst_n) begin
+		if (!rst_n)
+			// asynchronous reset
+			fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+		else if (rdy)
+			fsm_shreg <= ena ? {1'b1, {FSM_SHREG_WIDTH-1{1'b0}}} : {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+		else if (reduce_done || !reduce_stop)
+			// logical right shift, zero enters at the MSB
+			fsm_shreg <= fsm_shreg >> 1;
+	end
+	
+		
+		//
+		// Word Index Increment Logic
+		//
+	reg	index_b_ff;
+	
+	always @(posedge clk)
+		// index_b_ff toggles on every clock while inc_index_b is high and is held at 0 otherwise; it also selects which 16-bit half of b_din feeds the multiplier array
+		if (inc_index_b) index_b_ff <= ~index_b_ff;
+		else index_b_ff <= 1'b0;
+	
+	always @(posedge clk)
+		// A words are addressed upwards starting from word 0, B words downwards starting from the last word
+		if (rdy) begin
+			// idle: preset both indices
+			index_a		<= WORD_INDEX_ZERO;
+			index_b		<= WORD_INDEX_LAST;
+			//
+		end else begin
+			// index_b only steps on cycles where index_b_ff is low, i.e. at most once every two clocks of the inc_index_b window
+			if (inc_index_a)						index_a	<= WORD_INDEX_NEXT_OR_ZERO(index_a);
+			if (inc_index_b && !index_b_ff)	index_b	<= WORD_INDEX_PREVIOUS_OR_LAST(index_b);
+			//
+		end
+		
+		
+		//
+		// Wide Operand Buffer
+		//
+	reg	[255:0]	buf_a_wide;
+	// 256-bit copy of operand A, viewed as sixteen 16-bit slices that feed the multiplier array
+	always @(posedge clk)
+		// store_word_a has priority over shift_wide_a
+		if (store_word_a)
+			buf_a_wide <= {buf_a_wide[16 +: 256 - 3 * 16], {a_din[15:0], a_din[31:16]}, buf_a_wide[256 - 2 * 16 +: 16]};	// new[255:48] = old[223:16], new[47:16] = a_din with its 16-bit halves swapped, new[15:0] = old[239:224]
+		else if (shift_wide_a)
+			buf_a_wide <= {buf_a_wide[256-(16+1):0], buf_a_wide[256-16+:16]};	// rotate left by 16 bits: the top slice wraps around to the bottom
+		
+		
+		//
+		// Multiplier Array
+		//
+	wire	mac_inhibit;			// control signal to pause all accumulators
+	
+	wire	[46: 0]	mac[0:15];	// outputs of all accumulators
+	reg	[15: 0]	mac_clear;	// individual per-accumulator clear flag
+	
+	assign mac_inhibit = ~enable_mac_ab;
+	
+	always @(posedge clk)
+		// mac_clear sequence: all ones whenever clear_mac_ab is low; inside the window it becomes 16'h0002, then shifts left by one per clock, and returns to all ones on the clock after bit 15 was set
+		if (!clear_mac_ab)
+			mac_clear <= {16{1'b1}};
+		else begin
+			// first cycle of the window: the all-ones pattern is replaced by a single flag in bit 1
+			if (mac_clear == {16{1'b1}})
+				mac_clear <= {{14{1'b0}}, 1'b1, {1{1'b0}}};
+			else
+				mac_clear <= (mac_clear[15] == 1'b0) ? {mac_clear[14:0], 1'b0} : {16{1'b1}};
+				
+		
+		end
+	
+		//
+		// Array of parallel multipliers
+		//
+	genvar i;
+	generate for (i=0; i<16; i=i+1)
+		begin : gen_mac_array
+			// one mac16_wrapper per 16-bit slice of buf_a_wide; all 16 instances see the same 16-bit half of b_din (presumably a multiply-accumulate cell - its exact ce/clr behaviour is defined in mac16_wrapper, not here)
+			mac16_wrapper mac16_inst
+			(
+				.clk		(clk),
+				.ce		(~mac_inhibit),
+				
+				.clr		(mac_clear[i]),
+				
+				.a			(buf_a_wide[16*i+:16]),
+				.b			(index_b_ff ? b_din[15:0] : b_din[31:16]),
+				.s			(mac[i])
+			);
+			// index_b_ff set selects the low half of b_din, clear selects the high half
+		end
+	endgenerate
+	
+		//
+		// Intermediate Words
+		//
+	reg	[47*(2*OPERAND_NUM_WORDS-1)-1:0]	si_msb;	// 15 slots of 47 bits
+	reg	[47*(2*OPERAND_NUM_WORDS-0)-1:0]	si_lsb;	// 16 slots of 47 bits
+	
+	// next values of the two registers, assembled from the accumulator outputs
+	wire	[47*(2*OPERAND_NUM_WORDS-1)-1:0]	si_msb_new;
+	wire	[47*(2*OPERAND_NUM_WORDS-0)-1:0]	si_lsb_new;
+
+	generate for (i=0; i<16; i=i+1)
+		begin : gen_si_lsb_new
+			assign si_lsb_new[47*i+:47] = mac[15-i];	// slot i takes accumulator 15-i (order reversed)
+		end
+	endgenerate
+	
+	generate for (i=1; i<16; i=i+1)
+		begin : gen_si_msb_new
+			assign si_msb_new[47*(15-i)+:47] = mac_clear[i] ? mac[i] : si_msb[47*(15-i)+:47];	// slot 15-i takes accumulator i only while that accumulator's clear flag is set, otherwise it keeps its stored value
+		end
+	endgenerate
+	
+	always @(posedge clk) begin
+		// shift_si has priority: {si_msb, si_lsb} moves down by two slots (94 bits) per clock with zeros entering at the top; otherwise each register is loaded on its own store strobe
+		if (shift_si) begin
+			si_msb <= {{2*47{1'b0}}, si_msb[15*47-1:2*47]};
+			si_lsb <= {si_msb[2*47-1:0], si_lsb[16*47-1:2*47]};
+		end else begin
+		
+			if (store_si_msb)
+				si_msb <= si_msb_new;
+			
+			if (store_si_lsb)
+				si_lsb <= si_lsb_new;
+		end
+			
+	end
+	
+				
+		//
+		// Accumulators
+		//
+	wire	[46: 0]	add48_cw0_s;
+	wire	[46: 0]	add48_cw1_s;
+	
+	
+		//
+		// cw0, b, cw1, b: operand preparation for the two chained adders (the sum of cw0 is operand a of cw1)
+		//
+	reg	[30: 0]	si_prev_dly;	// registered bits [46:16] of the second slot of si_lsb
+	reg	[15: 0]	si_next_dly;	// registered bits [15:0] of the second slot of si_lsb
+	
+	always @(posedge clk)
+		// captured only while shifting, forced to zero otherwise
+		if (shift_si)
+			si_prev_dly <= si_lsb[93:63];
+		else
+			si_prev_dly <= {31{1'b0}};
+			
+	always @(posedge clk)
+		// captured on every clock
+		si_next_dly <= si_lsb[62:47];
+	// first adder: lowest slot of si_lsb plus the zero-extended si_prev_dly
+	wire	[46: 0]	add48_cw0_a = si_lsb[46:0];
+	wire	[46: 0]	add48_cw0_b = {{16{1'b0}}, si_prev_dly};
+	// second adder: first sum plus si_next_dly placed at bits [31:16] plus bits [46:32] of its own sum output fed back into bits [14:0]; mask_cw1_sum replaces that feedback with zeros
+	wire	[46: 0]	add48_cw1_a = add48_cw0_s;
+	wire	[46: 0]	add48_cw1_b = {{15{1'b0}}, si_next_dly, mask_cw1_sum ? {16{1'b0}} : {1'b0, add48_cw1_s[46:32]}};	
+	
+	adder47_wrapper add48_cw0_inst
+	(
+		.clk	(clk),
+		.a		(add48_cw0_a),
+		.b		(add48_cw0_b),
+		.s		(add48_cw0_s)
+	);
+	
+	adder47_wrapper add48_cw1_inst
+	(
+		.clk	(clk),
+		.a		(add48_cw1_a),
+		.b		(add48_cw1_b),
+		.s		(add48_cw1_s)
+	);
+	
+	
+	
+		//
+		// Full-Size Product
+		//
+	reg	[WORD_COUNTER_WIDTH:0]	bram_c_addr;		// write address into the full-size product buffer
+	
+	wire	[WORD_COUNTER_WIDTH:0]	reduce_c_addr;	// read address driven by the reduction stage
+	wire	[                31:0]	reduce_c_word;	// product word read back for the reduction stage
+	
+	// Write pointer: held at zero outside the store_c_word window and advanced by one for every word stored.
+	always @(posedge clk)
+		if (store_c_word)
+			bram_c_addr <= bram_c_addr + 1'b1;
+		else
+			bram_c_addr <= {(WORD_COUNTER_WIDTH+1){1'b0}};	// zero of exactly the register's width
+	
+	bram_1rw_1ro_readfirst #
+	(
+		.MEM_WIDTH			(32),
+		.MEM_ADDR_BITS		(WORD_COUNTER_WIDTH + 1)
+	)
+	bram_c_inst
+	(
+		.clk		(clk),
+
+		.a_addr	(bram_c_addr),
+		.a_wr		(store_c_word),
+		.a_in		(add48_cw1_s[31:0]),
+		.a_out	(),
+
+		.b_addr	(reduce_c_addr),
+		.b_out	(reduce_c_word)
+	);
+	
+	
+		//
+		// Reduction Stage
+		//
+	modular_reductor_256 reduce_256_inst
+	(
+		.clk		(clk),
+		.rst_n	(rst_n),
+		
+		.ena		(reduce_start),
+		.rdy		(reduce_done),
+		
+		.x_addr	(reduce_c_addr),
+		.n_addr	(n_addr),
+		.p_addr	(p_addr),
+		.p_wren	(p_wren),
+		
+		.x_din	(reduce_c_word),
+		.n_din	(n_din),
+		.p_dout	(p_dout)
+	);
+	
+		
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/modular/modular_reductor_256.v b/rtl/modular/modular_reductor_256.v
new file mode 100644
index 0000000..774f42e
--- /dev/null
+++ b/rtl/modular/modular_reductor_256.v
@@ -0,0 +1,666 @@
+//------------------------------------------------------------------------------
+//
+// modular_reductor_256.v
+// -----------------------------------------------------------------------------
+// Modular reductor.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module modular_reductor_256
+	(
+		clk, rst_n,
+		ena, rdy,
+		x_addr, n_addr, p_addr, p_wren,
+		x_din, n_din, p_dout
+	);
+		
+		//
+		// Constants
+		//
+	localparam	OPERAND_NUM_WORDS		= 8;
+	localparam	WORD_COUNTER_WIDTH	= 3;
+	
+	
+		//
+		// Handy Numbers
+		//
+	localparam	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_ZERO	= 0;
+	localparam	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_LAST	= 2 * OPERAND_NUM_WORDS - 1;
+	
+	
+		//
+		// Handy Functions
+		//
+	// Previous word index, wrapping from WORD_INDEX_ZERO to WORD_INDEX_LAST (the index is unsigned, so "<= zero" is the at-zero case).
+	function	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_PREVIOUS_OR_LAST;
+		input	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_CURRENT;
+		begin
+			WORD_INDEX_PREVIOUS_OR_LAST = (WORD_INDEX_CURRENT <= WORD_INDEX_ZERO) ? WORD_INDEX_LAST : (WORD_INDEX_CURRENT - 1'b1);
+		end
+	endfunction
+	
+	
+		//
+		// Ports
+		//
+	input		wire										clk;		// system clock
+	input		wire										rst_n;	// active-low async reset
+	
+	input		wire										ena;		// enable input
+	output	wire										rdy;		// ready output
+	
+	output	wire	[WORD_COUNTER_WIDTH-0:0]	x_addr;	// index of current X word
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	n_addr;	// index of current N word
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	p_addr;	// index of current P word
+	output	wire										p_wren;	// store current P word now	
+	
+	input		wire	[                  31:0]	x_din;	// X
+	input		wire	[                  31:0]	n_din;	// N (must be P-256!)
+	output	wire	[                  31:0]	p_dout;	// P = X mod N
+	
+	
+		//
+		// Word Indices
+		//
+	reg	[WORD_COUNTER_WIDTH:0]	index_x;
+	
+		
+		/* map registers to output ports */
+	assign x_addr	= index_x;
+	
+	
+		//
+		// FSM
+		//
+	// FSM bits: 2*OPERAND_NUM_WORDS+1 for the load phase (the store window trails the address
+	// window by one bit), a start/stop pair for each of the 5 reduction stages, and bit 0 as the ready flag.
+	localparam	FSM_SHREG_WIDTH	= (2 * OPERAND_NUM_WORDS + 1) + (2 * 5) + 1;
+	reg	[FSM_SHREG_WIDTH-1:0]	fsm_shreg;
+	assign rdy = fsm_shreg[0];
+	
+	// load phase windows start at the MSB; the ten reduce-phase bits sit directly above the ready bit
+	wire [2*OPERAND_NUM_WORDS-1:0]	fsm_shreg_inc_index_x	= fsm_shreg[FSM_SHREG_WIDTH-1 -: 2*OPERAND_NUM_WORDS];
+	wire [2*OPERAND_NUM_WORDS-1:0]	fsm_shreg_store_word_z	= fsm_shreg[FSM_SHREG_WIDTH-2 -: 2*OPERAND_NUM_WORDS];
+	wire [2*5-1:0]	fsm_shreg_reduce_stages	= fsm_shreg[10:1];
+	
+	// split the stage bits into start (odd positions) and stop (even positions) flags; the first stage ends up in bit 4
+	wire [4:0] fsm_shreg_reduce_stage_start;
+	wire [4:0] fsm_shreg_reduce_stage_stop;
+	genvar s;
+	generate for (s=0; s<5; s=s+1)
+		begin : gen_fsm_shreg_reduce_stages
+			assign fsm_shreg_reduce_stage_start[4 - s]	= fsm_shreg_reduce_stages[9 - 2*s];
+			assign fsm_shreg_reduce_stage_stop[4 - s]	= fsm_shreg_reduce_stages[8 - 2*s];
+		end
+	endgenerate
+	wire inc_index_x	= |fsm_shreg_inc_index_x;			// step the X word address
+	wire store_word_z	= |fsm_shreg_store_word_z;			// write strobe window for the z buffers
+	wire reduce_start	= |fsm_shreg_reduce_stage_start;	// start pulse for the arithmetic units
+	wire reduce_stop	= |fsm_shreg_reduce_stage_stop;	// FSM waits here for the units to finish
+	wire store_p		=  fsm_shreg_reduce_stage_stop[0];	// final stage: qualifies the result write
+	
+	
+	// Completion flags of the three arithmetic units that are started together in every reduction stage.
+	wire	reduce_adder0_done;
+	wire	reduce_adder1_done;
+	wire	reduce_subtractor_done;
+	
+	// A stage is finished only when all three units report ready.
+	wire	reduce_done_all = &{reduce_adder0_done, reduce_adder1_done, reduce_subtractor_done};
+	
+	// FSM register update:
+	//   reset - idle state, only bit 0 (rdy) set
+	//   idle  - remain idle while ena is low; when ena is high place the token in the MSB
+	//   busy  - move the token one bit towards bit 0 per clock, but hold position while
+	//           reduce_stop is active until reduce_done_all is high
+	always @(posedge clk or negedge rst_n) begin
+		if (!rst_n)
+			// asynchronous reset
+			fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+		else if (rdy)
+			fsm_shreg <= ena ? {1'b1, {FSM_SHREG_WIDTH-1{1'b0}}} : {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+		else if (reduce_done_all || !reduce_stop)
+			// logical right shift, zero enters at the MSB
+			fsm_shreg <= fsm_shreg >> 1;
+	end
+	
+		
+		//
+		// Word Index Increment Logic
+		//
+	// X word address: rests at WORD_INDEX_LAST while idle and steps down by one
+	// on every inc_index_x cycle (wrapping back to WORD_INDEX_LAST after word 0),
+	// so the words of X are addressed most-significant first.
+	always @(posedge clk) begin
+		if (rdy)
+			index_x <= WORD_INDEX_LAST;
+		else if (inc_index_x)
+			index_x <= WORD_INDEX_PREVIOUS_OR_LAST(index_x);
+	end
+			
+			
+		//
+		// Look-up Table
+		//
+		
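+		// TODO: Explain s5!!! (z_save is set only for s5 at index 12: that write stores the delayed word x_din_dly at address 6 instead of x_din, presumably because s5 needs the preceding X word a second time - confirm)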
+		
+	reg	[9*WORD_COUNTER_WIDTH-1:0]	z_addr;	//
+	reg	[9                   -1:0]	z_wren;	//
+	reg	[9                   -1:0]	z_mask;	// mask input to store zero word
+	reg	[9                   -1:0]	z_save;	// save previous word once again
+	
+	// Write-address look-up table for the nine z buffers: one 3-bit field per buffer (s9 on the left, s1 on the
+	// right), refreshed from index_x on every inc_index_x cycle.
+	// Fields of buffers that are not written for a given index are don't-care; they use the standard
+	// single-digit form 3'dx, which sets all three bits to x.
+	always @(posedge clk)
+		if (inc_index_x)
+			case (index_x)
+				//                     s9     s8     s7     s6     s5     s4     s3     s2     s1
+				//                     ||     ||     ||     ||     ||     ||     ||     ||     ||
+				4'd00:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd00};
+				4'd01:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd01};
+				4'd02:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd02};
+				4'd03:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd03};
+				4'd04:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd04};
+				4'd05:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd05};
+				4'd06:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd06};
+				4'd07:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd07};
+				4'd08:	z_addr <= {3'd02, 3'd03, 3'd04, 3'd06, 3'd07, 3'd00, 3'd00, 3'd00, 3'dx};
+				4'd09:	z_addr <= {3'd03, 3'd04, 3'd06, 3'd03, 3'd00, 3'd01, 3'd01, 3'd01, 3'dx};
+				4'd10:	z_addr <= {3'd04, 3'd05, 3'd05, 3'd07, 3'd01, 3'd02, 3'd02, 3'd02, 3'dx};
+				4'd11:	z_addr <= {3'd05, 3'd06, 3'd07, 3'd00, 3'd02, 3'd03, 3'd07, 3'd03, 3'dx};
+				4'd12:	z_addr <= {3'd06, 3'd07, 3'd00, 3'd01, 3'd06, 3'd04, 3'd03, 3'd04, 3'dx};
+				4'd13:	z_addr <= {3'd07, 3'd00, 3'd01, 3'd02, 3'd03, 3'd05, 3'd04, 3'd05, 3'dx};
+				4'd14:	z_addr <= {3'd00, 3'd01, 3'd02, 3'd04, 3'd04, 3'd06, 3'd05, 3'd06, 3'dx};
+				4'd15:	z_addr <= {3'd01, 3'd02, 3'd03, 3'd05, 3'd05, 3'd07, 3'd06, 3'd07, 3'dx};
+				// an index containing unknown bits propagates as unknown in simulation
+				default:	z_addr <= {9*WORD_COUNTER_WIDTH{1'bX}};
+			endcase
+	
+	always @(posedge clk)
+		// Per-buffer write enables (bit 0 = buffer 1 ... bit 8 = buffer 9); at the BRAM ports each bit is further gated by store_word_z.
+		case (index_x)
+			// Indices 0..7 enable only buffer 1, indices 8..15 enable buffers 2..9. Unlike z_addr, z_mask and z_save, this register is updated on every clock (no inc_index_x qualifier).
+			//                     9     8     7     6     5     4     3     2     1
+			//                     |     |     |     |     |     |     |     |     |
+			4'd00:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+			4'd01:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+			4'd02:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+			4'd03:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+			4'd04:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+			4'd05:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+			4'd06:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+			4'd07:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+			4'd08:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+			4'd09:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+			4'd10:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+			4'd11:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+			4'd12:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+			4'd13:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+			4'd14:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+			4'd15:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+			// an index with unknown bits disables all writes (the sibling tables drive X here instead)
+			default:	z_wren <= {9{1'b0}};
+			//
+		endcase
+		
+	always @(posedge clk)
+		// Per-buffer zero mask (bit 0 = buffer 1 ... bit 8 = buffer 9), refreshed on every inc_index_x cycle.
+		if (inc_index_x)
+			// A set bit makes the corresponding z buffer store an all-zero word instead of the incoming X word.
+			case (index_x)
+				// Indices 0..7 never mask; indices 8..15 zero selected words of buffers 2..9.
+				//                     9     8     7     6     5     4     3     2     1
+				//                     |     |     |     |     |     |     |     |     |
+				4'd00:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd01:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd02:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd03:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd04:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd05:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd06:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd07:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd08:	z_mask <= {1'b1, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
+				4'd09:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
+				4'd10:	z_mask <= {1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
+				4'd11:	z_mask <= {1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0, 1'b0};
+				4'd12:	z_mask <= {1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0};
+				4'd13:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0};
+				4'd14:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd15:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				//
+            default:	z_mask <= {9{1'bX}};
+				//
+         endcase
+			
+	always @(posedge clk)
+		// Per-buffer "save" flags (bit 0 = buffer 1 ... bit 8 = buffer 9), refreshed on every inc_index_x cycle.
+		if (inc_index_x)
+			// A set bit makes the corresponding z buffer store x_din_dly (x_din delayed by one clock) instead of x_din.
+			case (index_x)
+				// The only set bit in the whole table is buffer 5 at index 12.
+				//                     9     8     7     6     5     4     3     2     1
+				//                     |     |     |     |     |     |     |     |     |
+				4'd00:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd01:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd02:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd03:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd04:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd05:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd06:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd07:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd08:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd09:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd10:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd11:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd12:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd13:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd14:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				4'd15:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+				//
+            default:	z_save <= {9{1'bX}};
+				//
+         endcase
+		
+		
+		//
+		// Intermediate Numbers
+		//
+	reg	[WORD_COUNTER_WIDTH-1:0]	reduce_z_addr[1:9];
+	wire	[                32-1:0]	reduce_z_dout[1:9];
+	
+	reg	[31: 0]	x_din_dly;
+	always @(posedge clk)
+		//
+		x_din_dly <= x_din;
+		
+	
+	genvar z;
+	generate for (z=1; z<=9; z=z+1)
+		//
+		begin : gen_z_bram
+			//
+			bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+			bram_c_inst
+			(
+				.clk		(clk),
+
+				.a_addr	(z_addr[(z-1) * WORD_COUNTER_WIDTH +: WORD_COUNTER_WIDTH]),
+				.a_wr		(z_wren[z-1] & store_word_z),
+				.a_in		(z_mask[z-1] ? {32{1'b0}} : (z_save[z-1] ? x_din_dly : x_din)),
+				.a_out	(),
+
+				.b_addr	(reduce_z_addr[z]),
+				.b_out	(reduce_z_dout[z])
+			);
+			//
+		end
+		//
+	endgenerate
+		
+		
+		
+	
+	wire	[                32-1:0]	bram_sum0_wr_din;
+	wire	[WORD_COUNTER_WIDTH-1:0]	bram_sum0_wr_addr;
+	wire										bram_sum0_wr_wren;
+	
+	wire	[                32-1:0]	bram_sum1_wr_din;
+	wire	[WORD_COUNTER_WIDTH-1:0]	bram_sum1_wr_addr;
+	wire										bram_sum1_wr_wren;
+	
+	wire	[                32-1:0]	bram_diff_wr_din;
+	wire	[WORD_COUNTER_WIDTH-1:0]	bram_diff_wr_addr;
+	wire										bram_diff_wr_wren;
+	
+	wire	[                32-1:0]	bram_sum0_rd_dout;
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_sum0_rd_addr;
+	
+	wire	[                32-1:0]	bram_sum1_rd_dout;
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_sum1_rd_addr;
+
+	wire	[                32-1:0]	bram_diff_rd_dout;
+	reg	[WORD_COUNTER_WIDTH-1:0]	bram_diff_rd_addr;
+
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_sum0_inst
+	(
+		.clk		(clk),
+
+		.a_addr	(bram_sum0_wr_addr),
+		.a_wr		(bram_sum0_wr_wren),
+		.a_in		(bram_sum0_wr_din),
+		.a_out	(),
+
+		.b_addr	(bram_sum0_rd_addr),
+		.b_out	(bram_sum0_rd_dout)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_sum1_inst
+	(
+		.clk		(clk),
+
+		.a_addr	(bram_sum1_wr_addr),
+		.a_wr		(bram_sum1_wr_wren),
+		.a_in		(bram_sum1_wr_din),
+		.a_out	(),
+
+		.b_addr	(bram_sum1_rd_addr),
+		.b_out	(bram_sum1_rd_dout)
+	);
+	
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_diff_inst
+	(
+		.clk		(clk),
+
+		.a_addr	(bram_diff_wr_addr),
+		.a_wr		(bram_diff_wr_wren),
+		.a_in		(bram_diff_wr_din),
+		.a_out	(),
+
+		.b_addr	(bram_diff_rd_addr),
+		.b_out	(bram_diff_rd_dout)
+	);
+	
+	
+	
+	wire	[WORD_COUNTER_WIDTH-1:0]	adder0_ab_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	adder1_ab_addr;
+	wire	[WORD_COUNTER_WIDTH-1:0]	subtractor_ab_addr;
+	
+	reg	[                32-1:0]	adder0_a_din;
+	reg	[                32-1:0]	adder0_b_din;
+	
+	reg	[                32-1:0]	adder1_a_din;
+	reg	[                32-1:0]	adder1_b_din;
+	
+	reg	[                32-1:0]	subtractor_a_din;
+	reg	[                32-1:0]	subtractor_b_din;
+	
+	// n_addr - only 1 output, because all modules are in sync
+	
+	modular_adder adder_inst0
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(reduce_start),
+		.rdy			(reduce_adder0_done),
+		
+		.ab_addr		(adder0_ab_addr),
+		.n_addr		(),
+		.s_addr		(bram_sum0_wr_addr),
+		.s_wren		(bram_sum0_wr_wren),
+		
+		.a_din		(adder0_a_din),
+		.b_din		(adder0_b_din),
+		.n_din		(n_din),
+		.s_dout		(bram_sum0_wr_din)
+	);
+	
+	modular_adder adder_inst1
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(reduce_start),
+		.rdy			(reduce_adder1_done),
+		
+		.ab_addr		(adder1_ab_addr),
+		.n_addr		(),
+		.s_addr		(bram_sum1_wr_addr),
+		.s_wren		(bram_sum1_wr_wren),
+		
+		.a_din		(adder1_a_din),
+		.b_din		(adder1_b_din),
+		.n_din		(n_din),
+		.s_dout		(bram_sum1_wr_din)
+	);
+	
+	modular_subtractor subtractor_inst
+	(
+		.clk			(clk),
+		.rst_n		(rst_n),
+		
+		.ena			(reduce_start),
+		.rdy			(reduce_subtractor_done),
+		
+		.ab_addr		(subtractor_ab_addr),
+		.n_addr		(n_addr),
+		.d_addr		(bram_diff_wr_addr),
+		.d_wren		(bram_diff_wr_wren),
+		
+		.a_din		(subtractor_a_din),
+		.b_din		(subtractor_b_din),
+		.n_din		(n_din),
+		.d_dout		(bram_diff_wr_din)
+	);
+	
+	
+		//
+		// address
+		//
+	always @(*)
+		//
+		case (fsm_shreg_reduce_stage_stop)
+			//
+			5'b10000: begin
+				reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[2]	= adder0_ab_addr;
+				reduce_z_addr[3]	= adder1_ab_addr;
+				reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[6]	= subtractor_ab_addr;
+				reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
+				bram_sum0_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+				bram_sum1_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+				bram_diff_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+			end
+			//
+			5'b01000: begin
+				reduce_z_addr[1]	= adder0_ab_addr;
+				reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[4]	= adder1_ab_addr;
+				reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[7]	= subtractor_ab_addr;
+				reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
+				bram_sum0_rd_addr	= adder0_ab_addr;
+				bram_sum1_rd_addr	= adder1_ab_addr;
+				bram_diff_rd_addr = subtractor_ab_addr;
+			end
+			//
+			5'b00100: begin
+				reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[5]	= adder0_ab_addr;
+				reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[8]	= subtractor_ab_addr;
+				reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
+				bram_sum0_rd_addr	= adder0_ab_addr;
+				bram_sum1_rd_addr	= adder1_ab_addr;
+				bram_diff_rd_addr = subtractor_ab_addr;
+			end
+			//
+			5'b00010: begin
+				reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[9]	= subtractor_ab_addr;
+				bram_sum0_rd_addr	= adder0_ab_addr;
+				bram_sum1_rd_addr	= adder0_ab_addr;
+				bram_diff_rd_addr = subtractor_ab_addr;
+			end
+			//
+			5'b00001: begin
+				reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
+				bram_sum0_rd_addr	= adder0_ab_addr;
+				bram_sum1_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+				bram_diff_rd_addr = adder0_ab_addr;
+			end			
+			//
+			default: begin
+				reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
+				reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
+				bram_sum0_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+				bram_sum1_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+				bram_diff_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+			end
+			//
+		endcase
+
+	
+	
+		//
+		// adder 0
+		//
+	always @(*) begin
+		// Adder 0 operands per stage (its sum is written to bram_sum0): z2+z2, sum0+z1, sum0+z5, sum0+sum1, sum0+diff. Net effect: 2*z2 + z1 + z5 + sum1 + diff.
+		case (fsm_shreg_reduce_stage_stop)
+			5'b10000:	adder0_a_din = reduce_z_dout[2];
+			5'b01000:	adder0_a_din = bram_sum0_rd_dout;
+			5'b00100:	adder0_a_din = bram_sum0_rd_dout;
+			5'b00010:	adder0_a_din = bram_sum0_rd_dout;
+			5'b00001:	adder0_a_din = bram_sum0_rd_dout;
+			default:		adder0_a_din = {32{1'bX}};
+		endcase
+		// operand b: the term that is added in each stage
+		case (fsm_shreg_reduce_stage_stop)
+			5'b10000:	adder0_b_din = reduce_z_dout[2];
+			5'b01000:	adder0_b_din = reduce_z_dout[1];
+			5'b00100:	adder0_b_din = reduce_z_dout[5];
+			5'b00010:	adder0_b_din = bram_sum1_rd_dout;
+			5'b00001:	adder0_b_din = bram_diff_rd_dout;
+			default:		adder0_b_din = {32{1'bX}};
+		endcase
+		// outside the five stop states both operands are don't-care
+	end
+	
+		//
+		// adder 1
+		//
+	always @(*) begin
+		// Adder 1 operands per stage (its sum is written to bram_sum1): z3+z3, sum1+z4, sum1+0; unused (don't-care) in the last two stages. Net effect: 2*z3 + z4.
+		case (fsm_shreg_reduce_stage_stop)
+			5'b10000:	adder1_a_din = reduce_z_dout[3];
+			5'b01000:	adder1_a_din = bram_sum1_rd_dout;
+			5'b00100:	adder1_a_din = bram_sum1_rd_dout;
+			5'b00010:	adder1_a_din = {32{1'bX}};
+			5'b00001:	adder1_a_din = {32{1'bX}};
+			default:		adder1_a_din = {32{1'bX}};
+		endcase
+		// operand b: the term that is added in each stage (zero in the third stage)
+		case (fsm_shreg_reduce_stage_stop)
+			5'b10000:	adder1_b_din = reduce_z_dout[3];
+			5'b01000:	adder1_b_din = reduce_z_dout[4];
+			5'b00100:	adder1_b_din = {32{1'b0}};
+			5'b00010:	adder1_b_din = {32{1'bX}};
+			5'b00001:	adder1_b_din = {32{1'bX}};
+			default:		adder1_b_din = {32{1'bX}};
+		endcase
+		// outside the first three stop states both operands are don't-care
+	end
+	
+	
+		//
+		// subtractor
+		//
+	always @(*) begin
+		// Subtractor operands per stage (its difference is written to bram_diff): 0-z6, diff-z7, diff-z8, diff-z9; unused (don't-care) in the last stage. Net effect: -(z6 + z7 + z8 + z9).
+		case (fsm_shreg_reduce_stage_stop)
+			5'b10000:	subtractor_a_din = {32{1'b0}};
+			5'b01000:	subtractor_a_din = bram_diff_rd_dout;
+			5'b00100:	subtractor_a_din = bram_diff_rd_dout;
+			5'b00010:	subtractor_a_din = bram_diff_rd_dout;
+			5'b00001:	subtractor_a_din = {32{1'bX}};
+			default:		subtractor_a_din = {32{1'bX}};
+		endcase
+		// operand b: the term that is subtracted in each stage
+		case (fsm_shreg_reduce_stage_stop)
+			5'b10000:	subtractor_b_din = reduce_z_dout[6];
+			5'b01000:	subtractor_b_din = reduce_z_dout[7];
+			5'b00100:	subtractor_b_din = reduce_z_dout[8];
+			5'b00010:	subtractor_b_din = reduce_z_dout[9];
+			5'b00001:	subtractor_b_din = {32{1'bX}};
+			default:		subtractor_b_din = {32{1'bX}};
+		endcase
+		// outside the first four stop states both operands are don't-care
+	end
+
+
+
+
+	assign p_addr	= bram_sum0_wr_addr;
+	assign p_wren	= bram_sum0_wr_wren & store_p;
+	assign p_dout	= bram_sum0_wr_din;
+	
+	
+		
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/modular/modular_subtractor.v b/rtl/modular/modular_subtractor.v
new file mode 100644
index 0000000..322aec4
--- /dev/null
+++ b/rtl/modular/modular_subtractor.v
@@ -0,0 +1,292 @@
+//------------------------------------------------------------------------------
+//
+// modular_subtractor.v
+// -----------------------------------------------------------------------------
+// Modular subtractor.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module modular_subtractor
+	(
+		clk, rst_n,
+		ena, rdy,
+		ab_addr, n_addr, d_addr, d_wren,
+		a_din, b_din, n_din, d_dout
+	);
+	// Word-serial modular subtractor: reads A, B and N one 32-bit word at a time and writes D = (A - B) mod N.
+	// A - B and (A - B) + N are both kept; the borrow latched on the last A - B word selects which one is written.
+		//
+		// Parameters
+		//
+	parameter	OPERAND_NUM_WORDS		= 8;	// number of 32-bit words per operand
+	parameter	WORD_COUNTER_WIDTH	= 3;	// bit width of the word index registers and address ports
+	
+	
+		//
+		// Handy Numbers
+		//
+	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_ZERO	= 0;
+	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_LAST	= OPERAND_NUM_WORDS - 1;
+	
+	
+		//
+		// Handy Functions
+		// WORD_INDEX_NEXT_OR_ZERO: returns index + 1, or zero once the index has reached WORD_INDEX_LAST.
+	function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_NEXT_OR_ZERO;
+		input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
+		begin
+			WORD_INDEX_NEXT_OR_ZERO = (WORD_INDEX_CURRENT < WORD_INDEX_LAST) ?
+				WORD_INDEX_CURRENT + 1'b1 : WORD_INDEX_ZERO;
+		end
+	endfunction
+	
+	
+		//
+		// Ports
+		//
+	input		wire										clk;			// system clock
+	input		wire										rst_n;		// active-low async reset
+	
+	input		wire										ena;			// enable input
+	output	wire										rdy;			// ready output
+	
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	ab_addr;		// index of current A and B words
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	n_addr;		// index of current N word
+	output	wire	[WORD_COUNTER_WIDTH-1:0]	d_addr;		// index of current D word
+	output	wire										d_wren;		// store current D word now
+	
+	input		wire	[                  31:0]	a_din;		// A
+	input		wire	[                  31:0]	b_din;		// B
+	input		wire	[                  31:0]	n_din;		// N
+	output	wire	[                  31:0]	d_dout;		// D = (A - B) mod N
+	
+	
+		//
+		// Word Indices
+		// (not covered by rst_n; they are cleared synchronously whenever rdy is high, see below)
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_ab;
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_n;
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_d;
+		
+		/* map registers to output ports */
+	assign ab_addr	= index_ab;
+	assign n_addr	= index_n;
+	assign d_addr	= index_d;
+
+
+		//
+		// Subtractor
+		// Takes the current A and B words; NOTE(review): presumably d = a - b - b_in, wrapper is defined elsewhere - confirm.
+	wire	[31: 0]	sub32_d;
+	wire				sub32_b_in;
+	wire				sub32_b_out;
+	
+	subtractor32_wrapper subtractor32
+	(
+		.clk		(clk),
+		.a			(a_din),
+		.b			(b_din),
+		.d			(sub32_d),
+		.b_in		(sub32_b_in),
+		.b_out	(sub32_b_out)
+	);
+	
+	
+		//
+		// Adder
+		// Takes the subtractor output word and the current N word; add32_s is the candidate (A - B) + N word.
+	wire	[31: 0]	add32_s;
+	wire				add32_c_in;
+	wire				add32_c_out;
+	
+	adder32_wrapper adder32
+	(
+		.clk		(clk),
+		.a			(sub32_d),
+		.b			(n_din),
+		.s			(add32_s),
+		.c_in		(add32_c_in),
+		.c_out	(add32_c_out)
+	);
+	
+	
+		//
+		// FSM
+		//
+		// One-hot shift register: a single 1 enters at the MSB when ena is sampled and moves one bit right per clock; bit 0 is rdy.
+	localparam FSM_SHREG_WIDTH = 2*OPERAND_NUM_WORDS + 5;
+	
+	reg	[FSM_SHREG_WIDTH-1:0]	fsm_shreg;
+	
+	assign rdy = fsm_shreg[0];
+	
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_ab	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 0)];	// topmost OPERAND_NUM_WORDS bits
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_n		= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 1)];	// same window, one bit (one clock) later
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_dif_ab	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 3) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2)];	// two clocks after the A/B index window
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_dif_ab_n	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 3)];	// three clocks after the A/B index window
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_data_d	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (2 * OPERAND_NUM_WORDS + 3)];	// starts right after the store_dif_ab_n window ends
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_d		= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 5) : FSM_SHREG_WIDTH - (2 * OPERAND_NUM_WORDS + 4)];	// store_data_d window, one clock later
+	
+	wire fsm_latch_msb_borrow	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2)];	// lowest bit of the store_dif_ab window, i.e. its final clock
+	
+	wire inc_index_ab		= |fsm_shreg_inc_index_ab;	// each flag is high while the one-hot bit is inside its window
+	wire inc_index_n		= |fsm_shreg_inc_index_n;
+	wire store_dif_ab		= |fsm_shreg_store_dif_ab;
+	wire store_dif_ab_n	= |fsm_shreg_store_dif_ab_n;
+	wire store_data_d		= |fsm_shreg_store_data_d;
+	wire inc_index_d		= |fsm_shreg_inc_index_d;
+	
+	always @(posedge clk or negedge rst_n)
+		// async reset: only bit 0 set, so the module comes up idle with rdy high
+		if (rst_n == 1'b0)
+			//
+			fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+			//
+		else begin
+			// idle: ena = 1 moves the single 1 to the MSB (start), ena = 0 keeps it in bit 0 (stay ready)
+			if (rdy)	fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
+			// busy: shift right by one; zeros enter at the MSB
+			else		fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
+			//
+		end
+		
+	
+		//
+		// Borrow & Carry Masking Logic
+		// The borrow/carry inputs are the units' own outputs fed back, forced to zero when the index was zero on the previous clock.
+	reg	sub32_b_mask;
+	reg	add32_c_mask;
+	
+	
+	always @(posedge clk) begin
+		// masks are registered, so each reflects the index comparison from one clock earlier
+		sub32_b_mask <= (index_ab == WORD_INDEX_ZERO) ? 1'b1 : 1'b0;
+		add32_c_mask <= (index_n  == WORD_INDEX_ZERO) ? 1'b1 : 1'b0;
+		//
+	end
+	
+	assign sub32_b_in = sub32_b_out & ~sub32_b_mask;	
+	assign add32_c_in = add32_c_out & ~add32_c_mask;
+	
+	
+	
+		//
+		// Borrow & Carry Latch Logic
+		// Captures the subtractor borrow-out on the final clock of the store_dif_ab window; the latch has no reset.
+	reg sub32_borrow_latch;
+	
+	always @(posedge clk) begin
+		//
+		if (fsm_latch_msb_borrow) sub32_borrow_latch <= sub32_b_out;
+		//
+	end
+
+		
+		//
+		// Intermediate Results
+		// Two OPERAND_NUM_WORDS-word shift registers: d_ab collects sub32_d words, d_ab_n collects add32_s words.
+	reg	[32*OPERAND_NUM_WORDS-1:0]		d_ab;
+	reg	[32*OPERAND_NUM_WORDS-1:0]		d_ab_n;
+	
+	always @(posedge clk)
+		// output phase has priority: both registers shift down by one word, X is shifted in at the top
+		if (store_data_d) begin
+			//
+			d_ab		<= {{32{1'bX}}, d_ab[32*OPERAND_NUM_WORDS-1:32]};
+			d_ab_n	<= {{32{1'bX}}, d_ab_n[32*OPERAND_NUM_WORDS-1:32]};		
+			//
+		end else begin
+			// collection phase: a new word enters at the top, older words move toward bit 0
+			if (store_dif_ab) d_ab <= {sub32_d, d_ab[32*OPERAND_NUM_WORDS-1:32]};
+			if (store_dif_ab_n) d_ab_n <= {add32_s, d_ab_n[32*OPERAND_NUM_WORDS-1:32]};
+			//
+		end
+	
+	
+		//
+		// Word Index Increment Logic
+		// All three indices are held at zero while rdy is high and advance only inside their FSM windows.
+	always @(posedge clk)
+		//
+		if (rdy) begin
+			//
+			index_ab		<= WORD_INDEX_ZERO;
+			index_n		<= WORD_INDEX_ZERO;
+			index_d		<= WORD_INDEX_ZERO;
+			//
+		end else begin
+			//
+			if (inc_index_ab) index_ab <= WORD_INDEX_NEXT_OR_ZERO(index_ab);
+			if (inc_index_n)	index_n	<= WORD_INDEX_NEXT_OR_ZERO(index_n);
+			if (inc_index_d)	index_d	<= WORD_INDEX_NEXT_OR_ZERO(index_d);
+			//
+		end
+	
+	
+			//
+			// Output Difference Selector
+			// latched borrow set -> output words come from d_ab_n (A - B + N), otherwise from d_ab (A - B)
+	wire	mux_select_ab_n = sub32_borrow_latch;
+			
+	
+			//
+			// Output Data and Write Enable Logic
+			// d_wren and d_dout are registered copies of store_data_d and the selected low word.
+	reg				d_wren_reg;
+	reg	[31: 0]	d_dout_reg;
+	wire	[31: 0]	d_dout_mux = mux_select_ab_n ? d_ab_n[31:0] : d_ab[31:0];
+	
+	assign d_wren = d_wren_reg;
+	assign d_dout = d_dout_reg;
+	
+	always @(posedge clk)
+		// idle: write enable forced low, data output driven to X (don't care)
+		if (rdy) begin
+			//
+			d_wren_reg	<= 1'b0;
+			d_dout_reg	<= {32{1'bX}};
+			//
+		end else begin
+			// busy: data is only meaningful on clocks where store_data_d was high
+			d_wren_reg <= store_data_d;
+			d_dout_reg <= store_data_d ? d_dout_mux : {32{1'bX}};
+			//
+		end			
+
+	
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/multiword/mw_comparator.v b/rtl/multiword/mw_comparator.v
new file mode 100644
index 0000000..b97a6cf
--- /dev/null
+++ b/rtl/multiword/mw_comparator.v
@@ -0,0 +1,220 @@
+//------------------------------------------------------------------------------
+//
+// mw_comparator.v
+// -----------------------------------------------------------------------------
+// Multi-word comparator.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module mw_comparator
+	(
+		clk, rst_n,
+		ena, rdy,
+		xy_addr, x_din, y_din,
+		cmp_l, cmp_e, cmp_g
+	);
+	// Multi-word comparator: walks X and Y from word WORD_INDEX_LAST down to word 0, one 32-bit word per clock,
+	// and sets exactly one of cmp_l / cmp_e / cmp_g from the first word pair that differs (or cmp_e if none does).
+		//
+		// Parameters
+		//
+	parameter	WORD_COUNTER_WIDTH	= 3;	// bit width of the word index register and address port
+	parameter	OPERAND_NUM_WORDS		= 8;	// number of 32-bit words per operand
+	
+	
+		//
+		// Handy Numbers
+		//
+	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_ZERO	= 0;
+	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_LAST	= OPERAND_NUM_WORDS - 1;
+	
+	
+		//
+		// Handy Functions
+		// WORD_INDEX_PREV_OR_LAST: returns index - 1, or WORD_INDEX_LAST when the index is already zero.
+	function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_PREV_OR_LAST;
+		input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
+		begin
+			WORD_INDEX_PREV_OR_LAST = (WORD_INDEX_CURRENT > WORD_INDEX_ZERO) ?
+				WORD_INDEX_CURRENT - 1'b1 : WORD_INDEX_LAST;
+		end
+	endfunction
+	
+
+		//
+		// Ports
+		//
+	input		wire											clk;			// system clock
+	input		wire											rst_n;		// active-low async reset
+	
+	input		wire											ena;			// enable input
+	output	wire											rdy;			// ready output
+		
+	output	wire	[WORD_COUNTER_WIDTH-1:0]		xy_addr;		// address of current X and Y words
+	input		wire	[                32-1:0]		x_din;		// current X word
+	input		wire	[                32-1:0]		y_din;		// current Y word
+	
+	output	wire											cmp_l;		// X < Y ?
+	output	wire											cmp_e;		// X = Y ?
+	output	wire											cmp_g;		// X > Y ?
+	
+	
+		//
+		// Word Indices
+		// (index and result flags are not covered by rst_n; they are initialised synchronously, see below)
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_xy;
+	
+	reg										reg_cmp_l;
+	reg										reg_cmp_e;
+	reg										reg_cmp_g;
+	
+	
+		//
+		// Output Mapping
+		//
+	assign xy_addr	= index_xy;
+	
+	assign cmp_l = reg_cmp_l;
+	assign cmp_e = reg_cmp_e;
+	assign cmp_g = reg_cmp_g;
+
+
+		//
+		// FSM
+		// One-hot shift register: a single 1 enters at the MSB when ena is sampled and moves one bit right per clock; bit 0 is rdy.
+	localparam FSM_SHREG_WIDTH = 1 * OPERAND_NUM_WORDS + 3;
+	
+	reg	[FSM_SHREG_WIDTH-1:0]	fsm_shreg;
+	
+	assign rdy = fsm_shreg[0];
+	
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_dec_index_xy	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 0)];	// topmost OPERAND_NUM_WORDS bits
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_calc_leg			= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 3) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2)];	// same window, two clocks later
+	wire                         fsm_shreg_calc_leg_last	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2)];	// lowest bit of the calc_leg window, i.e. its final clock
+	
+	wire dec_index_xy		= |fsm_shreg_dec_index_xy;	// high while the one-hot bit is inside the window
+	wire calc_leg			= |fsm_shreg_calc_leg;
+	wire calc_leg_last	=  fsm_shreg_calc_leg_last;
+
+	
+	always @(posedge clk or negedge rst_n)
+		// async reset: only bit 0 set, so the module comes up idle with rdy high
+		if (rst_n == 1'b0)
+			//
+			fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+			//
+		else begin
+			// idle: ena = 1 moves the single 1 to the MSB (start), ena = 0 keeps it in bit 0 (stay ready)
+			if (rdy)	fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
+			// busy: shift right by one; zeros enter at the MSB
+			else		fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
+			//
+		end
+
+
+		//
+		// Word Index Decrement Logic
+		// The index is parked at WORD_INDEX_LAST while rdy is high, so the walk starts at the most significant word.
+	always @(posedge clk)
+		//
+		if (rdy)						index_xy <= WORD_INDEX_LAST;
+		else if (dec_index_xy)	index_xy <= WORD_INDEX_PREV_OR_LAST(index_xy);
+		
+		
+		//
+		// 32-bit Subtractor
+		// Takes the current X and Y words; NOTE(review): presumably d = a - b - b_in, wrapper is defined elsewhere - confirm.
+	wire	[31: 0]	sub32_d_out;
+	wire				sub32_b_in;
+	wire				sub32_b_out;
+	
+	subtractor32_wrapper subtractor32_inst
+	(
+		.clk		(clk),
+		
+		.a			(x_din),
+		.b			(y_din),
+		
+		.d			(sub32_d_out),
+		
+		.b_in		(sub32_b_in),
+		.b_out	(sub32_b_out)
+	);
+	
+	
+		//
+		// Borrow Masking Logic
+		// Borrow-in is the subtractor's own borrow-out fed back, forced to zero when the index was WORD_INDEX_LAST on the previous clock.
+	reg	sub32_b_mask;
+	
+	always @(posedge clk)
+		//
+		sub32_b_mask <= (index_xy  == WORD_INDEX_LAST) ? 1'b1 : 1'b0;
+		
+	assign sub32_b_in = sub32_b_out & ~sub32_b_mask;
+		
+		//
+		// Output Logic
+		// Flags are sticky: once cmp_l or cmp_g is set, later words are ignored until the next start.
+	wire	cmp_unresolved = !(cmp_l || cmp_g);
+	
+	wire	cmp_borrow_is_set				= (sub32_b_out ==  1'b1) ? 1'b1 : 1'b0;
+	wire	cmp_difference_is_nonzero	= (sub32_d_out != 32'd0) ? 1'b1 : 1'b0;
+	
+	always @(posedge clk)
+		// idle: flags keep their previous result until a new comparison is started with ena
+		if (rdy) begin
+			//
+			if (ena) begin
+				// new comparison: clear all three result flags
+				reg_cmp_l	<= 1'b0;
+				reg_cmp_e	<= 1'b0;
+				reg_cmp_g	<= 1'b0;
+				//
+			end
+			//
+		end else if (cmp_unresolved && calc_leg) begin
+			// borrow -> less; no borrow and nonzero difference -> greater; equal only if still unresolved on the final word
+			if ( cmp_borrow_is_set)																	reg_cmp_l <= 1'b1;
+			if (!cmp_borrow_is_set &&  cmp_difference_is_nonzero)							reg_cmp_g <= 1'b1;
+			if (!cmp_borrow_is_set && !cmp_difference_is_nonzero && calc_leg_last)	reg_cmp_e <= 1'b1;
+			//
+		end
+		
+	
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/multiword/mw_mover.v b/rtl/multiword/mw_mover.v
new file mode 100644
index 0000000..5db95a7
--- /dev/null
+++ b/rtl/multiword/mw_mover.v
@@ -0,0 +1,175 @@
+//------------------------------------------------------------------------------
+//
+// mw_mover.v
+// -----------------------------------------------------------------------------
+// Multi-word data mover.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module mw_mover
+	(
+		clk, rst_n,
+		ena, rdy,
+		x_addr, y_addr, y_wren,
+		x_din, y_dout
+	);
+	// Multi-word data mover: steps x_addr through OPERAND_NUM_WORDS words and forwards x_din to y_dout,
+	// with y_addr and y_wren trailing x_addr by one clock.
+		//
+		// Parameters
+		//
+	parameter	WORD_COUNTER_WIDTH	= 3;	// bit width of the word index registers and address ports
+	parameter	OPERAND_NUM_WORDS		= 8;	// number of 32-bit words to move
+	
+	
+		//
+		// Handy Numbers
+		//
+	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_ZERO	= 0;
+	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_LAST	= OPERAND_NUM_WORDS - 1;
+	
+	
+		//
+		// Handy Functions
+		// WORD_INDEX_NEXT_OR_ZERO: returns index + 1, or zero once the index has reached WORD_INDEX_LAST.
+	function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_NEXT_OR_ZERO;
+		input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
+		begin
+			WORD_INDEX_NEXT_OR_ZERO = (WORD_INDEX_CURRENT < WORD_INDEX_LAST) ?
+				WORD_INDEX_CURRENT + 1'b1 : WORD_INDEX_ZERO;
+		end
+	endfunction
+	
+
+		//
+		// Ports
+		//
+	input		wire											clk;			// system clock
+	input		wire											rst_n;		// active-low async reset
+	
+	input		wire											ena;			// enable input
+	output	wire											rdy;			// ready output
+		
+	output	wire	[WORD_COUNTER_WIDTH-1:0]		x_addr;		// address of current X word
+	output	wire	[WORD_COUNTER_WIDTH-1:0]		y_addr;		// address of current Y word
+	output	wire											y_wren;		// store current Y word
+	
+	input		wire	[                32-1:0]		x_din;		// current X word
+	output	wire	[                32-1:0]		y_dout;		// current Y word
+	
+	
+		//
+		// Word Indices
+		// (not covered by rst_n; they are cleared synchronously whenever rdy is high, see below)
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_x;
+	reg	[WORD_COUNTER_WIDTH-1:0]	index_y;
+	
+	
+		//
+		// Output Mapping
+		//
+	assign x_addr	= index_x;
+	assign y_addr	= index_y;
+
+
+		//
+		// FSM
+		// One-hot shift register: a single 1 enters at the MSB when ena is sampled and moves one bit right per clock; bit 0 is rdy.
+	localparam FSM_SHREG_WIDTH = 1 * OPERAND_NUM_WORDS + 2;
+	
+	reg	[FSM_SHREG_WIDTH-1:0]	fsm_shreg;
+	
+	assign rdy = fsm_shreg[0];
+	
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_x	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 0)];	// topmost OPERAND_NUM_WORDS bits
+	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_y	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 1)];	// same window, one bit (one clock) later
+	
+	wire inc_index_x	= |fsm_shreg_inc_index_x;
+	wire inc_index_y	= |fsm_shreg_inc_index_y;
+	wire store_word_y	= |fsm_shreg_inc_index_x;	// taken from the X window: after the y_wren_reg stage below it lines up with the Y window
+
+	
+	always @(posedge clk or negedge rst_n)
+		// async reset: only bit 0 set, so the module comes up idle with rdy high
+		if (rst_n == 1'b0)
+			//
+			fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+			//
+		else begin
+			// idle: ena = 1 moves the single 1 to the MSB (start), ena = 0 keeps it in bit 0 (stay ready)
+			if (rdy)	fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
+			// busy: shift right by one; zeros enter at the MSB
+			else		fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
+			//
+		end
+
+
+		//
+		// Word Index Increment Logic
+		// Both indices are held at zero while rdy is high and advance only inside their FSM windows.
+	always @(posedge clk)
+		//
+		if (rdy) begin
+			index_x <= WORD_INDEX_ZERO;
+			index_y <= WORD_INDEX_ZERO;
+		end else begin
+			if (inc_index_x)	index_x <= WORD_INDEX_NEXT_OR_ZERO(index_x);
+			if (inc_index_y)	index_y <= WORD_INDEX_NEXT_OR_ZERO(index_y);
+		end
+		
+		
+		//
+		// Write Enable Logic
+		// y_wren is store_word_y delayed by one clock and forced low on the clock after rdy was high.
+	reg	y_wren_reg;
+	
+	assign y_wren = y_wren_reg;
+	
+	always @(posedge clk)
+		//
+		if (rdy)		y_wren_reg	<= 1'b0;
+		else			y_wren_reg	<= store_word_y;
+		
+		
+		//
+		// Output Logic
+		// Data passes straight through combinationally; only the address and write enable are sequenced here.
+	assign y_dout = x_din;
+		
+	
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/util/bram_1rw_1ro_readfirst.v b/rtl/util/bram_1rw_1ro_readfirst.v
new file mode 100644
index 0000000..28782c2
--- /dev/null
+++ b/rtl/util/bram_1rw_1ro_readfirst.v
@@ -0,0 +1,101 @@
+//======================================================================
+//
+// Copyright (c) 2015, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module bram_1rw_1ro_readfirst
+  #(parameter MEM_WIDTH            = 32,	// bits per memory word
+    parameter MEM_ADDR_BITS        = 8)	// address width; the memory holds 2**MEM_ADDR_BITS words
+   (
+    input wire                     clk,	// single clock for both ports
+
+    input wire [MEM_ADDR_BITS-1:0] a_addr,	// port A address (read and write)
+    input wire                     a_wr,	// port A write enable
+    input wire [MEM_WIDTH-1:0]     a_in,	// port A write data
+    output wire [MEM_WIDTH-1:0]    a_out,	// port A registered read data
+
+    input wire [MEM_ADDR_BITS-1:0] b_addr,	// port B address (read only)
+    output wire [MEM_WIDTH-1:0]    b_out	// port B registered read data
+    );
+   // Dual-port synchronous memory: port A is read/write, port B is read-only, both clocked by clk.
+   // Reads are registered, so read data appears one clock after the address is presented.
+   //
+   // BRAM
+   //
+   (* RAM_STYLE="BLOCK" *)
+   reg [MEM_WIDTH-1:0]             bram[0:(2**MEM_ADDR_BITS)-1];
+	
+	
+	//
+	// Initialization
+	// (disabled: the zero-fill loop below is commented out, so memory contents start undefined in simulation)
+	/**
+	integer c;
+	initial begin
+		for (c=0; c<(2**MEM_ADDR_BITS); c=c+1)
+			bram[c] = {MEM_WIDTH{1'b0}};
+	end
+	**/
+	
+
+
+   //
+   // Output Registers
+   //
+   reg [MEM_WIDTH-1:0]             bram_reg_a;
+   reg [MEM_WIDTH-1:0]             bram_reg_b;
+
+   assign a_out = bram_reg_a;
+   assign b_out = bram_reg_b;
+
+
+   //
+   // Read-Write Port A
+   //
+   always @(posedge clk) begin
+      // read-first: both assignments are nonblocking, so a_out receives the word stored before this clock's write
+      bram_reg_a <= bram[a_addr];
+      //
+      if (a_wr) bram[a_addr] <= a_in;
+      //
+   end
+
+
+   //
+   // Read-Only Port B
+   //
+   always @(posedge clk)
+     // same-clock writes through port A are not visible here until a later read
+     bram_reg_b <= bram[b_addr];
+
+
+endmodule



More information about the Commits mailing list