[Cryptech-Commits] [user/shatov/x25519_fpga_model] 02/02: Cleaned up and optimized microcode.

git at cryptech.is git at cryptech.is
Mon Jun 4 20:44:34 UTC 2018


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/x25519_fpga_model.

commit 23cc981edb675625484eaff13e440781045a4973
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Mon Jun 4 22:00:57 2018 +0300

    Cleaned up and optimized microcode.
    
    I'm pretty sure that the Montgomery ladder is as fast as possible: the overhead
    is only one bank swap operation, which is ~0.8%. I suspect that the conversion
    to affine coordinates might have some potential for improvement: it does 15 swap
    operations, but given that it also does 254 multiplications, the overhead is
    ~0.6%. In his original paper Bernstein estimated the conversion to be ~7% of the
    entire X25519 computation, so in that sense the potential improvement in modular
    inversion is negligible.
---
 x25519_fpga_curve_microcode.cpp | 1144 +++++++++++----------------------------
 1 file changed, 323 insertions(+), 821 deletions(-)

diff --git a/x25519_fpga_curve_microcode.cpp b/x25519_fpga_curve_microcode.cpp
index 7d40d70..11c8cc6 100644
--- a/x25519_fpga_curve_microcode.cpp
+++ b/x25519_fpga_curve_microcode.cpp
@@ -43,55 +43,12 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <string.h>
 #include "x25519_fpga_model.h"
 
 
 //------------------------------------------------------------------------------
-// Locals
-//------------------------------------------------------------------------------
-static FPGA_BUFFER LADDER_R0_X;
-static FPGA_BUFFER LADDER_R0_Z;
-
-static FPGA_BUFFER LADDER_R1_X;
-static FPGA_BUFFER LADDER_R1_Z;
-
-static FPGA_BUFFER LADDER_T0_X;
-static FPGA_BUFFER LADDER_T0_Z;
-
-static FPGA_BUFFER LADDER_T1_X;
-static FPGA_BUFFER LADDER_T1_Z;
-
-static FPGA_BUFFER LADDER_S0;
-static FPGA_BUFFER LADDER_S1;
-static FPGA_BUFFER LADDER_D0;
-static FPGA_BUFFER LADDER_D1;
-static FPGA_BUFFER LADDER_QS0;
-static FPGA_BUFFER LADDER_QD0;
-static FPGA_BUFFER LADDER_S0D1;
-static FPGA_BUFFER LADDER_S1D0;
-static FPGA_BUFFER LADDER_TS;
-static FPGA_BUFFER LADDER_TD;
-static FPGA_BUFFER LADDER_QTD;
-static FPGA_BUFFER LADDER_T0;
-static FPGA_BUFFER LADDER_TA;
-static FPGA_BUFFER LADDER_T1;
-
-static FPGA_BUFFER REDUCE_R1;
-static FPGA_BUFFER REDUCE_R2;
-static FPGA_BUFFER REDUCE_T_1;
-static FPGA_BUFFER REDUCE_T_10;
-static FPGA_BUFFER REDUCE_T_1001;
-static FPGA_BUFFER REDUCE_T_1011;
-static FPGA_BUFFER REDUCE_T_X5;
-static FPGA_BUFFER REDUCE_T_X10;
-static FPGA_BUFFER REDUCE_T_X20;
-static FPGA_BUFFER REDUCE_T_X40;
-static FPGA_BUFFER REDUCE_T_X50;
-static FPGA_BUFFER REDUCE_T_X100;
-
-
-//------------------------------------------------------------------------------
-// Error Handle
+// Error Handler
 //------------------------------------------------------------------------------
 #define uop_fatal(msg)	{(void)printf("%s\n",msg);exit(EXIT_FAILURE);}
 
@@ -99,159 +56,113 @@ static FPGA_BUFFER REDUCE_T_X100;
 //------------------------------------------------------------------------------
 // Storage Buffers
 //------------------------------------------------------------------------------
-static FPGA_BUFFER BANK_INT[4][64];
-static bool bank_flags[4][64];
-static FPGA_BUFFER BANK_EXT_X;
-static FPGA_BUFFER BANK_EXT_Y;
+static FPGA_BUFFER BUF_LO[64];
+static FPGA_BUFFER BUF_HI[64];
+
+static bool buf_flag_lo[64];
+static bool buf_flag_hi[64];
+
 
 //------------------------------------------------------------------------------
 enum UOP_BANK
 //------------------------------------------------------------------------------
 {
-	UOP_BANK_INT_A	= 0,
-	UOP_BANK_INT_B	= 1,
-	UOP_BANK_INT_C	= 2,
-	UOP_BANK_INT_D	= 3,
-	UOP_BANK_EXT
+	BANK_LO, BANK_HI
 };
 
+
 //------------------------------------------------------------------------------
-enum UOP_SRC_OPERAND
+enum UOP_OPERAND
 //------------------------------------------------------------------------------
 {
-	UOP_SRC_INT_LADDER_R0_X		=  0,
-	UOP_SRC_INT_LADDER_R0_Z		=  1,
-
-	UOP_SRC_INT_LADDER_R1_X		=  2,
-	UOP_SRC_INT_LADDER_R1_Z		=  3,
-
-	UOP_SRC_INT_LADDER_T0_X		=  4,
-	UOP_SRC_INT_LADDER_T0_Z		=  5,
-
-	UOP_SRC_INT_LADDER_T1_X		=  6,
-	UOP_SRC_INT_LADDER_T1_Z		=  7,
-
-	UOP_SRC_INT_LADDER_S0		=  8,
-	UOP_SRC_INT_LADDER_S1		=  9,
-	UOP_SRC_INT_LADDER_D0		= 10,
-	UOP_SRC_INT_LADDER_D1		= 11,
-	UOP_SRC_INT_LADDER_QS0		= 12,
-	UOP_SRC_INT_LADDER_QD0		= 13,
-	UOP_SRC_INT_LADDER_S0D1		= 14,
-	UOP_SRC_INT_LADDER_S1D0		= 15,
-	UOP_SRC_INT_LADDER_TS		= 16,
-	UOP_SRC_INT_LADDER_TD		= 17,
-	UOP_SRC_INT_LADDER_QTD		= 18,
-	UOP_SRC_INT_LADDER_T0		= 19,
-	UOP_SRC_INT_LADDER_TA		= 20,
-	UOP_SRC_INT_LADDER_T1		= 21,
-
-	UOP_SRC_INT_REDUCE_R1		= 22,
-	UOP_SRC_INT_REDUCE_R2		= 23,
-	UOP_SRC_INT_REDUCE_T_1		= 24,
-	UOP_SRC_INT_REDUCE_T_10		= 25,
-	UOP_SRC_INT_REDUCE_T_1001	= 26,
-	UOP_SRC_INT_REDUCE_T_1011	= 27,
-	UOP_SRC_INT_REDUCE_T_X5		= 28,
-	UOP_SRC_INT_REDUCE_T_X10	= 29,
-	UOP_SRC_INT_REDUCE_T_X20	= 30,
-	UOP_SRC_INT_REDUCE_T_X40	= 31,
-	UOP_SRC_INT_REDUCE_T_X50	= 32,
-	UOP_SRC_INT_REDUCE_T_X100	= 33,
-
-	UOP_SRC_EXT_ZERO,
-	UOP_SRC_EXT_ONE,
-
-	UOP_SRC_EXT_A24,
-
-	UOP_SRC_EXT_X
+	CONST_ZERO		=  0,
+	CONST_ONE		=  1,
+	CONST_A24		=  2,
+
+	LADDER_R0_X		=  3,
+	LADDER_R0_Z		=  4,
+
+	LADDER_R1_X		=  5,
+	LADDER_R1_Z		=  6,
+
+	LADDER_T0_X		=  7,
+	LADDER_T0_Z		=  8,
+
+	LADDER_T1_X		=  9,
+	LADDER_T1_Z		= 10,
+
+	LADDER_S0		= 11,
+	LADDER_S1		= 12,
+
+	LADDER_D0		= 13,
+	LADDER_D1		= 14,
+
+	LADDER_QS0		= 15,
+	LADDER_QD0		= 16,
+
+	LADDER_S0D1		= 17,
+	LADDER_S1D0		= 18,
+
+	LADDER_TS		= 19,
+	LADDER_TD		= 20,
+
+	LADDER_QTD		= 21,
+
+	LADDER_T0		= 22,
+	LADDER_TA		= 23,
+	LADDER_T1		= 24,
+
+	LADDER_P_X		= 25,
+
+	LADDER_DUMMY	= 26,
+
+	REDUCE_R1		= 27,
+	REDUCE_R2		= 28,
+
+	REDUCE_T_1		= 29,
+	REDUCE_T_10		= 30,
+	REDUCE_T_1001	= 31,
+	REDUCE_T_1011	= 32,
+
+	REDUCE_T_X5		= 33,
+	REDUCE_T_X10	= 34,
+	REDUCE_T_X20	= 35,
+	REDUCE_T_X40	= 36,
+	REDUCE_T_X50	= 37,
+	REDUCE_T_X100	= 38
 };
 
 
 //------------------------------------------------------------------------------
-enum UOP_DST_OPERAND
+enum UOP_MODULUS
 //------------------------------------------------------------------------------
 {
-	UOP_DST_INT_LADDER_R0_X		=  0,
-	UOP_DST_INT_LADDER_R0_Z		=  1,
-
-	UOP_DST_INT_LADDER_R1_X		=  2,
-	UOP_DST_INT_LADDER_R1_Z		=  3,
-
-	UOP_DST_INT_LADDER_T0_X		=  4,
-	UOP_DST_INT_LADDER_T0_Z		=  5,
-
-	UOP_DST_INT_LADDER_T1_X		=  6,
-	UOP_DST_INT_LADDER_T1_Z		=  7,
-
-	UOP_DST_INT_LADDER_S0		=  8,
-	UOP_DST_INT_LADDER_S1		=  9,
-	UOP_DST_INT_LADDER_D0		= 10,
-	UOP_DST_INT_LADDER_D1		= 11,
-	UOP_DST_INT_LADDER_QS0		= 12,
-	UOP_DST_INT_LADDER_QD0		= 13,
-	UOP_DST_INT_LADDER_S0D1		= 14,
-	UOP_DST_INT_LADDER_S1D0		= 15,
-	UOP_DST_INT_LADDER_TS		= 16,
-	UOP_DST_INT_LADDER_TD		= 17,
-	UOP_DST_INT_LADDER_QTD		= 18,
-	UOP_DST_INT_LADDER_T0		= 19,
-	UOP_DST_INT_LADDER_TA		= 20,
-	UOP_DST_INT_LADDER_T1		= 21,
-
-	UOP_DST_INT_REDUCE_R1		= 22,
-	UOP_DST_INT_REDUCE_R2		= 23,
-	UOP_DST_INT_REDUCE_T_1		= 24,
-	UOP_DST_INT_REDUCE_T_10		= 25,
-	UOP_DST_INT_REDUCE_T_1001	= 26,
-	UOP_DST_INT_REDUCE_T_1011	= 27,
-	UOP_DST_INT_REDUCE_T_X5		= 28,
-	UOP_DST_INT_REDUCE_T_X10	= 29,
-	UOP_DST_INT_REDUCE_T_X20	= 30,
-	UOP_DST_INT_REDUCE_T_X40	= 31,
-	UOP_DST_INT_REDUCE_T_X50	= 32,
-	UOP_DST_INT_REDUCE_T_X100	= 33,
-
-	UOP_DST_EXT_Y,
-
-	UOP_DST_DUMMY
+	MOD_1P,
+	MOD_2P
 };
 
-void dump_fpga_buffer(const char *msg, const FPGA_BUFFER *buf)
-{
-	printf("%s", msg);
-	for (int i=FPGA_OPERAND_NUM_WORDS; i>0; i--)
-		printf("%08x ", buf->words[i]);
-	printf("\n");
-}
 
 //------------------------------------------------------------------------------
-// Prototypes
+enum UOP_MATH
 //------------------------------------------------------------------------------
-static void uop_move	(UOP_BANK src_bank_1, UOP_SRC_OPERAND src_operand_1,
-						 UOP_BANK src_bank_2, UOP_SRC_OPERAND src_operand_2,
-						 UOP_BANK dst_bank_1, UOP_DST_OPERAND dst_operand_1,
-						 UOP_BANK dst_bank_2, UOP_DST_OPERAND dst_operand_2);
+{
+	ADD, SUB, MUL
+};
 
-static void uop_add		(UOP_BANK src_bank_1, UOP_SRC_OPERAND src_operand_1,
-						 UOP_BANK src_bank_2, UOP_SRC_OPERAND src_operand_2,
-						 UOP_BANK dst_bank_1, UOP_DST_OPERAND dst_operand_1,
-						 UOP_BANK dst_bank_2, UOP_DST_OPERAND dst_operand_2,
-						 FPGA_BUFFER *modulus);
 
-static void uop_sub		(UOP_BANK src_bank_1, UOP_SRC_OPERAND src_operand_1,
-						 UOP_BANK src_bank_2, UOP_SRC_OPERAND src_operand_2,
-						 UOP_BANK dst_bank_1, UOP_DST_OPERAND dst_operand_1,
-						 UOP_BANK dst_bank_2, UOP_DST_OPERAND dst_operand_2,
-						 FPGA_BUFFER *modulus);
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+static void uop_move	(UOP_BANK src, UOP_OPERAND s_op1, UOP_OPERAND s_op2,
+						 UOP_BANK dst, UOP_OPERAND d_op1, UOP_OPERAND d_op2);
 
-static void uop_mul		(UOP_BANK src_bank_1, UOP_SRC_OPERAND src_operand_1,
-						 UOP_BANK src_bank_2, UOP_SRC_OPERAND src_operand_2,
-						 UOP_BANK dst_bank_1, UOP_DST_OPERAND dst_operand_1,
-						 UOP_BANK dst_bank_2, UOP_DST_OPERAND dst_operand_2);
+static void uop_calc	(UOP_MATH math,
+						 UOP_BANK src, UOP_OPERAND s_op1, UOP_OPERAND s_op2,
+						 UOP_BANK dst, UOP_OPERAND d_op,
+						 UOP_MODULUS mod);
 
-static void bank2buffer(UOP_BANK bank, UOP_DST_OPERAND operand, FPGA_BUFFER *buffer);
-static void buffer2bank(FPGA_BUFFER *buffer, UOP_BANK bank, UOP_SRC_OPERAND operand);
+static void uop_load	(FPGA_BUFFER *mem, UOP_BANK dst, UOP_OPERAND d_op);
+static void uop_stor	(UOP_BANK src, UOP_OPERAND s_op, FPGA_BUFFER *mem);
 
 
 //------------------------------------------------------------------------------
@@ -268,761 +179,352 @@ static void buffer2bank(FPGA_BUFFER *buffer, UOP_BANK bank, UOP_SRC_OPERAND oper
 void fpga_curve_scalar_multiply_microcode(FPGA_BUFFER *PX, FPGA_BUFFER *K, FPGA_BUFFER *QX)
 //------------------------------------------------------------------------------
 {
+	bool k_bit, s;							// 1-bit values
+	FPGA_WORD k_word;						// current word of multiplier
 	int word_count, bit_count, cyc_count;	// counters
 
+		// reset bank flags
+	(void)memset(buf_flag_lo, 0, sizeof buf_flag_lo);
+	(void)memset(buf_flag_hi, 0, sizeof buf_flag_hi);
 
-	int i, j;
-	for (i=0; i<4; i++)
-		for (j=0; j<64; j++)
-			bank_flags[i][j] = false;
-
-
-		// pre
-	fpga_multiword_copy(PX, &BANK_EXT_X);
-
-		// initialization
-	
+		// initialize internal banks
+	fpga_multiword_copy(&X25519_ZERO, &BUF_LO[CONST_ZERO]);
+	fpga_multiword_copy(&X25519_ZERO, &BUF_HI[CONST_ZERO]);
 
-	// fpga_multiword_copy(&X25519_ONE,  &LADDER_R0_X);
- 	// fpga_multiword_copy(&X25519_ZERO, &LADDER_R0_Z);
-	uop_move	(UOP_BANK_EXT,   UOP_SRC_EXT_ONE,     UOP_BANK_EXT,   UOP_SRC_EXT_ZERO,
-				 UOP_BANK_INT_A, UOP_DST_INT_LADDER_R0_X, UOP_BANK_INT_B, UOP_DST_INT_LADDER_R0_Z);
+	fpga_multiword_copy(&X25519_ONE, &BUF_LO[CONST_ONE]);
+	fpga_multiword_copy(&X25519_ONE, &BUF_HI[CONST_ONE]);
 
-	// fpga_multiword_copy(PX,           &LADDER_R1_X);
-	// fpga_multiword_copy(&X25519_ONE,  &LADDER_R1_Z);
-	uop_move	(UOP_BANK_EXT,   UOP_SRC_EXT_X,       UOP_BANK_EXT,   UOP_SRC_EXT_ONE,
-				 UOP_BANK_INT_A, UOP_DST_INT_LADDER_R1_X, UOP_BANK_INT_B, UOP_DST_INT_LADDER_R1_Z);
+	fpga_multiword_copy(&X25519_A24, &BUF_LO[CONST_A24]);
+	fpga_multiword_copy(&X25519_A24, &BUF_HI[CONST_A24]);
 
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_LADDER_R0_X, &LADDER_R0_X);
-	//bank2buffer(UOP_BANK_INT_B, UOP_DST_INT_LADDER_R0_Z, &LADDER_R0_Z);
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_LADDER_R1_X, &LADDER_R1_X);
-	//bank2buffer(UOP_BANK_INT_B, UOP_DST_INT_LADDER_R1_Z, &LADDER_R1_Z);
+	buf_flag_lo[CONST_ZERO] = true;
+	buf_flag_hi[CONST_ZERO] = true;
+	buf_flag_lo[CONST_ONE] = true;
+	buf_flag_hi[CONST_ONE] = true;
+	buf_flag_lo[CONST_A24] = true;
+	buf_flag_hi[CONST_A24] = true;
 
+		// initialization
+	uop_load(PX, BANK_HI, LADDER_P_X);
+	uop_move(BANK_HI, CONST_ONE,   CONST_ZERO, BANK_LO, LADDER_R0_X, LADDER_R0_Z);
+	uop_move(BANK_HI, LADDER_P_X, CONST_ONE,  BANK_LO, LADDER_R1_X, LADDER_R1_Z);
 
-	FPGA_WORD k_word;
-	bool k_bit, s = false;
-
+		// ladder
+	s = false;
 	for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--)
 	{
 		for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
 		{
-			k_word = K->words[word_count - 1] >> (bit_count - 1);
-			k_bit = (k_word & (FPGA_WORD)1) == 1;
+			k_word = K->words[word_count - 1] >> (bit_count - 1);	// current word
+			k_bit = (k_word & (FPGA_WORD)1) == 1;					// current bit
 
-			if (s == k_bit)
-				uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_R0_X, UOP_BANK_INT_B, UOP_SRC_INT_LADDER_R0_Z,
-							 UOP_BANK_INT_C, UOP_DST_INT_LADDER_T0_X, UOP_BANK_INT_D, UOP_DST_INT_LADDER_T0_Z);
-			else
-				uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_R1_X, UOP_BANK_INT_B, UOP_SRC_INT_LADDER_R1_Z,
-							 UOP_BANK_INT_C, UOP_DST_INT_LADDER_T0_X, UOP_BANK_INT_D, UOP_DST_INT_LADDER_T0_Z);
+				// inputs are all in LO: R0_X, R0_Z, R1_X, R1_Z
 
+				// swap if needed
 			if (s == k_bit)
-				uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_R1_X, UOP_BANK_INT_B, UOP_SRC_INT_LADDER_R1_Z,
-							 UOP_BANK_INT_C, UOP_DST_INT_LADDER_T1_X, UOP_BANK_INT_D, UOP_DST_INT_LADDER_T1_Z);
+			{	uop_move(BANK_LO, LADDER_R0_X, LADDER_R0_Z, BANK_HI, LADDER_T0_X, LADDER_T0_Z);	// HI: T0_X, T0_Z = LO: R0_X, R0_Z
+				uop_move(BANK_LO, LADDER_R1_X, LADDER_R1_Z, BANK_HI, LADDER_T1_X, LADDER_T1_Z);	// HI: T1_X, T1_Z = LO: R1_X, R1_Z
+			}
 			else
-				uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_R0_X, UOP_BANK_INT_B, UOP_SRC_INT_LADDER_R0_Z,
-							 UOP_BANK_INT_C, UOP_DST_INT_LADDER_T1_X, UOP_BANK_INT_D, UOP_DST_INT_LADDER_T1_Z);
+			{	uop_move(BANK_LO, LADDER_R1_X, LADDER_R1_Z, BANK_HI, LADDER_T0_X, LADDER_T0_Z);	// HI: T0_X, T0_Z = LO: R1_X, R1_Z
+				uop_move(BANK_LO, LADDER_R0_X, LADDER_R0_Z, BANK_HI, LADDER_T1_X, LADDER_T1_Z);	// HI: T1_X, T1_Z = LO: R0_X, R0_Z
+			}
 
+				// remember whether we actually did the swap
 			s = k_bit;
 
-			uop_add		(UOP_BANK_INT_C, UOP_SRC_INT_LADDER_T0_X, UOP_BANK_INT_D, UOP_SRC_INT_LADDER_T0_Z,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_S0,   UOP_BANK_INT_B, UOP_DST_INT_LADDER_S0,
-						 &X25519_2P);
-
-			uop_add		(UOP_BANK_INT_C, UOP_SRC_INT_LADDER_T1_X, UOP_BANK_INT_D, UOP_SRC_INT_LADDER_T1_Z,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_S1,   UOP_BANK_INT_B, UOP_DST_INT_LADDER_S1,
-						 &X25519_2P);
+				// run step
+			uop_calc(ADD, BANK_HI,  LADDER_T0_X, LADDER_T0_Z, BANK_LO, LADDER_S0, MOD_2P);	// LO: S0 = HI: T0_X + T0_Z
+			uop_calc(ADD, BANK_HI,  LADDER_T1_X, LADDER_T1_Z, BANK_LO, LADDER_S1, MOD_2P);	// LO: S1 = HI: T1_X + T1_Z
+			uop_calc(SUB, BANK_HI,  LADDER_T0_X, LADDER_T0_Z, BANK_LO, LADDER_D0, MOD_2P);	// LO: D0 = HI: T0_X - T0_Z
+			uop_calc(SUB, BANK_HI,  LADDER_T1_X, LADDER_T1_Z, BANK_LO, LADDER_D1, MOD_2P);	// LO: D1 = HI: T1_X - T1_Z
 
-			uop_sub		(UOP_BANK_INT_C, UOP_SRC_INT_LADDER_T0_X, UOP_BANK_INT_D, UOP_SRC_INT_LADDER_T0_Z,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_D0,   UOP_BANK_INT_B, UOP_DST_INT_LADDER_D0,
-						 &X25519_2P);
+			uop_calc(MUL, BANK_LO,  LADDER_S0,   LADDER_S0,   BANK_HI, LADDER_QS0, MOD_2P);				// HI: QS0  = LO: S0 * S0
+			uop_calc(MUL, BANK_LO,  LADDER_D0,   LADDER_D0,   BANK_HI, LADDER_QD0, MOD_2P);				// HI: QD0  = LO: D0 * D0
+			uop_calc(MUL, BANK_LO,  LADDER_S0,   LADDER_D1,   BANK_HI, LADDER_S0D1, MOD_2P);				// HI: S0D1 = LO: S0 * D1
+			uop_calc(MUL, BANK_LO,  LADDER_S1,   LADDER_D0,   BANK_HI, LADDER_S1D0, MOD_2P);				// HI: S1D0 = LO: S1 * D0
 
-			uop_sub		(UOP_BANK_INT_C, UOP_SRC_INT_LADDER_T1_X, UOP_BANK_INT_D, UOP_SRC_INT_LADDER_T1_Z,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_D1,   UOP_BANK_INT_B, UOP_DST_INT_LADDER_D1,
-						 &X25519_2P);
+			uop_calc(ADD, BANK_HI,  LADDER_S1D0, LADDER_S0D1, BANK_LO, LADDER_TS, MOD_2P);	// LO: TS = HI: S1D0 + S0D1
+			uop_calc(SUB, BANK_HI,  LADDER_S1D0, LADDER_S0D1, BANK_LO, LADDER_TD, MOD_2P);	// LO: TD = HI: S1D0 - S0D1
 
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_S0,  UOP_BANK_INT_B, UOP_SRC_INT_LADDER_S0,
-						 UOP_BANK_INT_C, UOP_DST_INT_LADDER_QS0, UOP_BANK_INT_D, UOP_DST_INT_LADDER_QS0);
+			uop_calc(MUL, BANK_LO,  LADDER_TD,   LADDER_TD,   BANK_HI, LADDER_QTD, MOD_2P);				// HI: QTD = LO: TD * TD
 
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_D0,  UOP_BANK_INT_B, UOP_SRC_INT_LADDER_D0,
-						 UOP_BANK_INT_C, UOP_DST_INT_LADDER_QD0, UOP_BANK_INT_D, UOP_DST_INT_LADDER_QD0);
-
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_S0,   UOP_BANK_INT_B, UOP_SRC_INT_LADDER_D1,
-						 UOP_BANK_INT_C, UOP_DST_INT_LADDER_S0D1, UOP_BANK_INT_D, UOP_DST_INT_LADDER_S0D1);
-
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_S1,   UOP_BANK_INT_B, UOP_SRC_INT_LADDER_D0,
-						 UOP_BANK_INT_C, UOP_DST_INT_LADDER_S1D0, UOP_BANK_INT_D, UOP_DST_INT_LADDER_S1D0);
-
-			uop_add		(UOP_BANK_INT_C, UOP_SRC_INT_LADDER_S1D0, UOP_BANK_INT_D, UOP_SRC_INT_LADDER_S0D1,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_TS,   UOP_BANK_INT_B, UOP_DST_INT_LADDER_TS,
-						 &X25519_2P);
-
-			uop_sub		(UOP_BANK_INT_C, UOP_SRC_INT_LADDER_S1D0, UOP_BANK_INT_D, UOP_SRC_INT_LADDER_S0D1,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_TD,   UOP_BANK_INT_B, UOP_DST_INT_LADDER_TD,
-						 &X25519_2P);
-
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_TD,   UOP_BANK_INT_B, UOP_SRC_INT_LADDER_TD,
-						 UOP_BANK_INT_C, UOP_DST_INT_LADDER_QTD,  UOP_BANK_INT_D, UOP_DST_INT_LADDER_QTD);
-
-			uop_sub		(UOP_BANK_INT_C, UOP_SRC_INT_LADDER_QS0,  UOP_BANK_INT_D, UOP_SRC_INT_LADDER_QD0,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_T0,   UOP_BANK_INT_B, UOP_DST_INT_LADDER_T0,
-						 &X25519_2P);
-
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_T0,  UOP_BANK_EXT,   UOP_SRC_EXT_A24,
-						 UOP_BANK_INT_C, UOP_DST_INT_LADDER_TA,  UOP_BANK_INT_D, UOP_DST_INT_LADDER_TA);
-
-			uop_add		(UOP_BANK_INT_C, UOP_SRC_INT_LADDER_TA,  UOP_BANK_INT_D, UOP_SRC_INT_LADDER_QD0,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_T1,  UOP_BANK_INT_B, UOP_DST_INT_LADDER_T1,
-						 &X25519_2P);
-
-			uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_LADDER_QS0,  UOP_BANK_INT_D, UOP_SRC_INT_LADDER_QD0,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_R0_X, UOP_BANK_INT_B, UOP_DST_INT_LADDER_R0_X);
-
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_T0,   UOP_BANK_INT_B, UOP_SRC_INT_LADDER_T1,
-						 UOP_BANK_INT_C, UOP_DST_INT_LADDER_T0_Z, UOP_BANK_INT_D, UOP_DST_INT_LADDER_T0_Z);
-
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_TS,  UOP_BANK_INT_B, UOP_SRC_INT_LADDER_TS,
-						 UOP_BANK_INT_C, UOP_DST_INT_LADDER_T1_X,  UOP_BANK_INT_D, UOP_DST_INT_LADDER_T1_X);
-
-			uop_mul		(UOP_BANK_EXT, UOP_SRC_EXT_X, UOP_BANK_INT_C, UOP_SRC_INT_LADDER_QTD,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_R1_Z,  UOP_BANK_INT_B, UOP_DST_INT_LADDER_R1_Z);
+			uop_calc(SUB, BANK_HI,  LADDER_QS0,  LADDER_QD0,  BANK_LO, LADDER_T0, MOD_2P);	// LO: T0 = HI: QS0 - QD0
+			uop_calc(MUL, BANK_LO,  LADDER_T0,   CONST_A24,   BANK_HI, LADDER_TA, MOD_2P);				// HI: TA = LO: T0 * A24
+			uop_calc(ADD, BANK_HI,  LADDER_TA,   LADDER_QD0,  BANK_LO, LADDER_T1, MOD_2P);	// LO: T1 = HI: TA + QD0
+			
+			uop_calc(MUL, BANK_HI,  LADDER_QS0,  LADDER_QD0,  BANK_LO, LADDER_R0_X, MOD_2P);				// LO: R0_X = HI: QS0 * QD0
+			uop_calc(MUL, BANK_LO,  LADDER_T0,   LADDER_T1,   BANK_HI, LADDER_R0_Z, MOD_2P);				// HI: R0_Z = LO: T0 * T1
+			uop_calc(MUL, BANK_LO,  LADDER_TS,   LADDER_TS,   BANK_HI, LADDER_R1_X, MOD_2P);				// HI: R1_X = LO: TS * TS
+			uop_calc(MUL, BANK_HI,  LADDER_P_X,  LADDER_QTD,  BANK_LO, LADDER_R1_Z, MOD_2P);				// LO: R1_Z = HI: PX * QTD
 
-			uop_move	(UOP_BANK_INT_C, UOP_SRC_INT_LADDER_T1_X, UOP_BANK_INT_D, UOP_SRC_INT_LADDER_T0_Z,
-						 UOP_BANK_INT_A, UOP_DST_INT_LADDER_R1_X, UOP_BANK_INT_B, UOP_DST_INT_LADDER_R0_Z);
+			uop_move(BANK_HI, LADDER_R0_Z, LADDER_R1_X, BANK_LO, LADDER_R0_Z, LADDER_R1_X);	// LO: R0_Z, R1_X = HI: R0_Z, R1_X
 		}
 	}
 	
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_LADDER_R0_X, &LADDER_R0_X);
-	//bank2buffer(UOP_BANK_INT_B, UOP_DST_INT_LADDER_R0_Z, &LADDER_R0_Z);
+		// T_1
+	uop_move(BANK_HI, LADDER_R0_Z, LADDER_R0_Z, BANK_LO, REDUCE_T_1, REDUCE_T_1);
+	uop_move(BANK_LO, REDUCE_T_1, REDUCE_T_1, BANK_HI, REDUCE_T_1, REDUCE_T_1);
 
-		// since the lower three bits of the private key are always 000,
-		// the result is in R0X, R0z
+		// T_10
+	uop_calc(MUL, BANK_LO, REDUCE_T_1, REDUCE_T_1, BANK_HI, REDUCE_T_10, MOD_2P);
 
-		// conversion to affine coordinates
-	//fpga_multiword_copy(&LADDER_R0_Z, &REDUCE_T_1);
+		// T_1001
+	uop_calc(MUL, BANK_HI, REDUCE_T_10, REDUCE_T_10, BANK_LO, REDUCE_R1, MOD_2P);
+	uop_calc(MUL, BANK_LO, REDUCE_R1, REDUCE_R1, BANK_HI, REDUCE_R2, MOD_2P);
+	uop_calc(MUL, BANK_HI, REDUCE_R2, REDUCE_T_1, BANK_LO, REDUCE_T_1001, MOD_2P);
 
-	uop_add		(UOP_BANK_INT_B, UOP_SRC_INT_LADDER_R0_Z, UOP_BANK_EXT, UOP_SRC_EXT_ZERO,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_T_1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_T_1,
-				 &X25519_2P);
+		// T_1011
+	uop_move(BANK_HI, REDUCE_T_10, REDUCE_T_10, BANK_LO, REDUCE_T_10, REDUCE_T_10);
+	uop_calc(MUL, BANK_LO, REDUCE_T_1001, REDUCE_T_10, BANK_HI, REDUCE_T_1011, MOD_2P);
 
-	uop_move	(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_T_1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_T_1,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_1, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_T_1);
+		// T_X5
+	uop_calc(MUL, BANK_HI, REDUCE_T_1011, REDUCE_T_1011, BANK_LO, REDUCE_R1, MOD_2P);
+	uop_calc(MUL, BANK_LO, REDUCE_R1, REDUCE_T_1001, BANK_HI, REDUCE_T_X5, MOD_2P);
 
-	//uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_R0_X, UOP_BANK_INT_B, UOP_SRC_INT_LADDER_R0_Z,
-		//		 UOP_BANK_INT_C, UOP_DST_INT_LADDER_T0_X, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_T_1);
-	
-
-	//
-	//fpga_modular_mul(&REDUCE_T_1, &REDUCE_T_1, &REDUCE_T_10, &X25519_2P);
-	//
-	uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_T_1,  UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_T_1,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_10, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_T_10);
-
-
-//	fpga_modular_mul(&REDUCE_T_10, &REDUCE_T_10, &REDUCE_R1, &X25519_2P);
-//	fpga_modular_mul(&REDUCE_R1, &REDUCE_R1, &REDUCE_R2, &X25519_2P);
-//	fpga_modular_mul(&REDUCE_R2, &REDUCE_T_1, &REDUCE_T_1001, &X25519_2P);
-
-	uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_10, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_10,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1,   UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-
-	uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-
-	uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2,    UOP_BANK_INT_B,  UOP_SRC_INT_REDUCE_T_1,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_T_1001, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_T_1001);
-
-
-	//
-	//fpga_modular_mul(&REDUCE_T_1001, &REDUCE_T_10, &REDUCE_T_1011, &X25519_2P);
-	//
-
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_10, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_10,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_T_10, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_T_10);
-
-	uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_T_1001, UOP_BANK_INT_D,  UOP_SRC_INT_REDUCE_T_10,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_1011, UOP_BANK_INT_B,  UOP_DST_INT_REDUCE_T_1011);
-
-
-
-	//fpga_modular_mul(&REDUCE_T_1011, &REDUCE_T_1011, &REDUCE_R1, &X25519_2P);
-	//fpga_modular_mul(&REDUCE_R1, &REDUCE_T_1001, &REDUCE_T_X5, &X25519_2P);
-
-	uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_1011, UOP_BANK_INT_B,  UOP_SRC_INT_REDUCE_T_1011,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D,      UOP_DST_INT_REDUCE_R1);
-
-	uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1,   UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_T_1001,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_X5, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_T_X5);
-
-
-
-
-	//
-	//fpga_multiword_copy(&REDUCE_T_X5, &REDUCE_R1);
-
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_X5, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_X5,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-
-	//bank2buffer(UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, &REDUCE_R1);
-
-	for (cyc_count=0; cyc_count<5; cyc_count++)
-	{	if (!(cyc_count % 2))
-	
-			//fpga_modular_mul(&REDUCE_R1, &REDUCE_R1, &REDUCE_R2, &X25519_2P);
-
-			uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1,
-						 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-		else		
-			//fpga_modular_mul(&REDUCE_R2, &REDUCE_R2, &REDUCE_R1, &X25519_2P);
-			
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-						 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-	}
+		// T_X10
+	uop_move(BANK_HI, REDUCE_T_X5, REDUCE_T_X5, BANK_LO, REDUCE_R1, REDUCE_R1);
 
+	for (cyc_count=0; cyc_count<4; cyc_count++)
+		if (!(cyc_count % 2))	uop_calc(MUL, BANK_LO, REDUCE_R1, REDUCE_R1, BANK_HI, REDUCE_R2, MOD_2P);
+		else					uop_calc(MUL, BANK_HI, REDUCE_R2, REDUCE_R2, BANK_LO, REDUCE_R1, MOD_2P);
 
-	//fpga_modular_mul(&REDUCE_R2, &REDUCE_T_X5, &REDUCE_T_X10, &X25519_2P);
-	//
-	uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2,    UOP_BANK_INT_B,  UOP_SRC_INT_REDUCE_T_X5,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_T_X10, UOP_BANK_INT_D,  UOP_DST_INT_REDUCE_T_X10);
+	uop_calc(MUL, BANK_LO, REDUCE_R1, REDUCE_R1, BANK_HI, REDUCE_R2, MOD_2P);
+	uop_calc(MUL, BANK_HI, REDUCE_R2, REDUCE_T_X5, BANK_LO, REDUCE_T_X10, MOD_2P);
 
-
-
-	//fpga_multiword_copy(&REDUCE_T_X10, &REDUCE_R1);
-	//for (cyc_count=0; cyc_count<10; cyc_count++)
-	//{	if (!(cyc_count % 2))	fpga_modular_mul(&REDUCE_R1, &REDUCE_R1, &REDUCE_R2, &X25519_2P);
-		//else			fpga_modular_mul(&REDUCE_R2, &REDUCE_R2, &REDUCE_R1, &X25519_2P);
-	//}
-	//fpga_modular_mul(&REDUCE_R1, &REDUCE_T_X10, &REDUCE_T_X20, &X25519_2P);
-
-	uop_move	(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_T_X10, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_T_X10,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-	
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
+		// T_X20
+	uop_move(BANK_LO, REDUCE_T_X10, REDUCE_T_X10, BANK_HI, REDUCE_R1, REDUCE_R1);
+	uop_move(BANK_LO, REDUCE_T_X10, REDUCE_T_X10, BANK_HI, REDUCE_T_X10, REDUCE_T_X10);
 
 	for (cyc_count=0; cyc_count<10; cyc_count++)
-	{	if (!(cyc_count % 2))
-			uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1,
-						 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-		else		
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-						 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-	}
-	uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_T_X10, UOP_BANK_INT_D,  UOP_SRC_INT_REDUCE_R1,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_X20, UOP_BANK_INT_B,  UOP_DST_INT_REDUCE_T_X20);
-
-
+		if (!(cyc_count % 2))	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_R1, BANK_LO, REDUCE_R2, MOD_2P);
+		else					uop_calc(MUL, BANK_LO, REDUCE_R2, REDUCE_R2, BANK_HI, REDUCE_R1, MOD_2P);
 
+	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_T_X10, BANK_LO, REDUCE_T_X20, MOD_2P);
 
-	
-	
-	//
-	//fpga_multiword_copy(&REDUCE_T_X20, &REDUCE_R1);
-	//for (cyc_count=0; cyc_count<20; cyc_count++)
-	//{	if (!(cyc_count % 2))	fpga_modular_mul(&REDUCE_R1, &REDUCE_R1, &REDUCE_R2, &X25519_2P);
-		//else			fpga_modular_mul(&REDUCE_R2, &REDUCE_R2, &REDUCE_R1, &X25519_2P);
-	//}
-	//fpga_modular_mul(&REDUCE_R1, &REDUCE_T_X20, &REDUCE_T_X40, &X25519_2P);
-
-
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_X20, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_X20,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_X20, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_X20,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_T_X20, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_T_X20);
+		// T_X40
+	uop_move(BANK_LO, REDUCE_T_X20, REDUCE_T_X20, BANK_HI, REDUCE_R1, REDUCE_R1);
+	uop_move(BANK_LO, REDUCE_T_X20, REDUCE_T_X20, BANK_HI, REDUCE_T_X20, REDUCE_T_X20);
 
 	for (cyc_count=0; cyc_count<20; cyc_count++)
-	{	if (!(cyc_count % 2))
-			uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1,
-						 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-		else		
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-						 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-	}
-	uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_T_X20, UOP_BANK_INT_D,  UOP_SRC_INT_REDUCE_R1,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_X40, UOP_BANK_INT_B,  UOP_DST_INT_REDUCE_T_X40);
+		if (!(cyc_count % 2))	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_R1, BANK_LO, REDUCE_R2, MOD_2P);
+		else					uop_calc(MUL, BANK_LO, REDUCE_R2, REDUCE_R2, BANK_HI, REDUCE_R1, MOD_2P);
 
-	
-	//
-//	fpga_multiword_copy(&REDUCE_T_X40, &REDUCE_R1);
-//	for (cyc_count=0; cyc_count<10; cyc_count++)
-//	{	if (!(cyc_count % 2))	fpga_modular_mul(&REDUCE_R1, &REDUCE_R1, &REDUCE_R2, &X25519_2P);
-//		else			fpga_modular_mul(&REDUCE_R2, &REDUCE_R2, &REDUCE_R1, &X25519_2P);
-//	}
-//	fpga_modular_mul(&REDUCE_R1, &REDUCE_T_X10, &REDUCE_T_X50, &X25519_2P);
-	//
-	
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_X40, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_X40,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
+	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_T_X20, BANK_LO, REDUCE_T_X40, MOD_2P);
 
-	//uop_move	(UOP_BANK_INT_, UOP_SRC_INT_REDUCE_, UOP_BANK_INT_, UOP_SRC_INT_REDUCE_,
-		//		 UOP_BANK_INT_, UOP_DST_INT_REDUCE_, UOP_BANK_INT_, UOP_DST_INT_REDUCE_);
+		// T_X50		
+	uop_move(BANK_LO, REDUCE_T_X40, REDUCE_T_X40, BANK_HI, REDUCE_R1, REDUCE_R1);
 
 	for (cyc_count=0; cyc_count<10; cyc_count++)
-	{	if (!(cyc_count % 2))
-			uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1,
-						 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-		else		
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-						 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-	}
-	uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D,  UOP_SRC_INT_REDUCE_T_X10,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_X50, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_T_X50);
-
-
+		if (!(cyc_count % 2))	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_R1, BANK_LO, REDUCE_R2, MOD_2P);
+		else					uop_calc(MUL, BANK_LO, REDUCE_R2, REDUCE_R2, BANK_HI, REDUCE_R1, MOD_2P);
 
+	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_T_X10, BANK_LO, REDUCE_T_X50, MOD_2P);
 
-
-
-
-	
-//	fpga_multiword_copy(&REDUCE_T_X50, &REDUCE_R1);
-//	for (cyc_count=0; cyc_count<50; cyc_count++)
-//	{	if (!(cyc_count % 2))	fpga_modular_mul(&REDUCE_R1, &REDUCE_R1, &REDUCE_R2, &X25519_2P);
-//		else			fpga_modular_mul(&REDUCE_R2, &REDUCE_R2, &REDUCE_R1, &X25519_2P);
-//	}
-//	fpga_modular_mul(&REDUCE_R1, &REDUCE_T_X50, &REDUCE_T_X100, &X25519_2P);
-	//
-	
-
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_X50, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_X50,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_X50, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_X50,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_T_X50, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_T_X50);
+		// T_X100
+	uop_move(BANK_LO, REDUCE_T_X50, REDUCE_T_X50, BANK_HI, REDUCE_R1, REDUCE_R1);
+	uop_move(BANK_LO, REDUCE_T_X50, REDUCE_T_X50, BANK_HI, REDUCE_T_X50, REDUCE_T_X50);
 
 	for (cyc_count=0; cyc_count<50; cyc_count++)
-	{	if (!(cyc_count % 2))
-			uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1,
-						 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-		else		
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-						 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-	}
-	uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D,  UOP_SRC_INT_REDUCE_T_X50,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_X100, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_T_X100);
-
+		if (!(cyc_count % 2))	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_R1, BANK_LO, REDUCE_R2, MOD_2P);
+		else					uop_calc(MUL, BANK_LO, REDUCE_R2, REDUCE_R2, BANK_HI, REDUCE_R1, MOD_2P);
 
+	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_T_X50, BANK_LO, REDUCE_T_X100, MOD_2P);
 
-
-
-
-
-
-	
-	//fpga_multiword_copy(&REDUCE_T_X100, &REDUCE_R1);
-	//for (cyc_count=0; cyc_count<100; cyc_count++)
-	//{	if (!(cyc_count % 2))	fpga_modular_mul(&REDUCE_R1, &REDUCE_R1, &REDUCE_R2, &X25519_2P);
-		//else			fpga_modular_mul(&REDUCE_R2, &REDUCE_R2, &REDUCE_R1, &X25519_2P);
-	//}
-	//
-	//fpga_modular_mul(&REDUCE_R1, &REDUCE_T_X100, &REDUCE_R2, &X25519_2P);
-	//
-
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_X100, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_X100,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1,     UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_T_X100, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_X100,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_T_X100, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_T_X100);
+	uop_move(BANK_LO, REDUCE_T_X100, REDUCE_T_X100, BANK_HI, REDUCE_R1, REDUCE_R1);
+	uop_move(BANK_LO, REDUCE_T_X100, REDUCE_T_X100, BANK_HI, REDUCE_T_X100, REDUCE_T_X100);
 
 	for (cyc_count=0; cyc_count<100; cyc_count++)
-	{	if (!(cyc_count % 2))
-			uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1,
-						 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-		else		
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-						 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-	}
+		if (!(cyc_count % 2))	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_R1, BANK_LO, REDUCE_R2, MOD_2P);
+		else					uop_calc(MUL, BANK_LO, REDUCE_R2, REDUCE_R2, BANK_HI, REDUCE_R1, MOD_2P);
 
-	uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_T_X100,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
+	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_T_X100, BANK_LO, REDUCE_R2, MOD_2P);
 
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-
-	
-	
-//	for (cyc_count=0; cyc_count<50; cyc_count++)
-//	{	if ((cyc_count % 2))	fpga_modular_mul(&REDUCE_R1, &REDUCE_R1, &REDUCE_R2, &X25519_2P);	// !!! (swapped sides)
-//		else			fpga_modular_mul(&REDUCE_R2, &REDUCE_R2, &REDUCE_R1, &X25519_2P);
-//	}
-	//
 	for (cyc_count=0; cyc_count<50; cyc_count++)
-	{	if (!(cyc_count % 2))
-			uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1,
-						 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-		else		
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-						 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-	}
-
-
+		if (!(cyc_count % 2))	uop_calc(MUL, BANK_LO, REDUCE_R2, REDUCE_R2, BANK_HI, REDUCE_R1, MOD_2P);
+		else					uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_R1, BANK_LO, REDUCE_R2, MOD_2P);
 
+	uop_calc(MUL, BANK_LO, REDUCE_R2, REDUCE_T_X50, BANK_HI, REDUCE_R1, MOD_2P);
 	
-	//fpga_modular_mul(&REDUCE_R2, &REDUCE_T_X50, &REDUCE_R1, &X25519_2P);
-	//fpga_multiword_copy(&REDUCE_R1, &REDUCE_R2);
+	for (cyc_count=0; cyc_count<4; cyc_count++)
+		if (!(cyc_count % 2))	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_R1, BANK_LO, REDUCE_R2, MOD_2P);
+		else					uop_calc(MUL, BANK_LO, REDUCE_R2, REDUCE_R2, BANK_HI, REDUCE_R1, MOD_2P);
 	
-	uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_T_X50,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-
-
-	//for (cyc_count=0; cyc_count<5; cyc_count++)
-	//{	if (!(cyc_count % 2))	fpga_modular_mul(&REDUCE_R1, &REDUCE_R1, &REDUCE_R2, &X25519_2P);
-	//	else			fpga_modular_mul(&REDUCE_R2, &REDUCE_R2, &REDUCE_R1, &X25519_2P);
-	//}
-
-	for (cyc_count=0; cyc_count<5; cyc_count++)
-	{	if (!(cyc_count % 2))
-			uop_mul		(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1,
-						 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
-		else		
-			uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_R2,
-						 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-	}
-
-
-	// result is in R2
-
-	//
-	//fpga_modular_mul(&REDUCE_R2, &REDUCE_T_1011, &REDUCE_R1, &X25519_2P);
-	//fpga_modular_mul(&LADDER_R0_X, &REDUCE_R1, &REDUCE_R2, &X25519_2P);
-
-
-	uop_mul		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_SRC_INT_REDUCE_T_1011,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1);
-
-	uop_move	(UOP_BANK_INT_A, UOP_SRC_INT_LADDER_R0_X, UOP_BANK_INT_B, UOP_SRC_INT_LADDER_R0_Z,
-				 UOP_BANK_INT_C, UOP_DST_INT_LADDER_R0_X, UOP_BANK_INT_D, UOP_DST_INT_LADDER_R0_Z);
-
-	uop_mul		(UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_C, UOP_SRC_INT_LADDER_R0_X,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_INT_B, UOP_DST_INT_REDUCE_R2);
+	uop_calc(MUL, BANK_HI, REDUCE_R1, REDUCE_R1, BANK_LO, REDUCE_R2, MOD_2P);
+	uop_move(BANK_HI, REDUCE_T_1011, REDUCE_T_1011, BANK_LO, REDUCE_T_1011, REDUCE_T_X100);
+	uop_calc(MUL, BANK_LO, REDUCE_R2, REDUCE_T_1011, BANK_HI, REDUCE_R2, MOD_2P);
+	uop_move(BANK_HI, REDUCE_R2, REDUCE_R2, BANK_LO, REDUCE_R2, REDUCE_R2);
 
+	uop_calc(MUL, BANK_LO, REDUCE_R2, LADDER_R0_X, BANK_HI, REDUCE_R1, MOD_2P);
 
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_1,    &REDUCE_T_1);
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_10,   &REDUCE_T_10);
-	//bank2buffer(UOP_BANK_INT_C, UOP_DST_INT_REDUCE_T_1001, &REDUCE_T_1001);
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_1011, &REDUCE_T_1011);
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_X5,   &REDUCE_T_X5);
-	//bank2buffer(UOP_BANK_INT_C, UOP_DST_INT_REDUCE_T_X10,  &REDUCE_T_X10);
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_X20,  &REDUCE_T_X20);
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_X40,  &REDUCE_T_X40);
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_X50,  &REDUCE_T_X50);
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_REDUCE_T_X100, &REDUCE_T_X100);
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2,     &REDUCE_R2);
-
-
-	//bank2buffer(UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, &REDUCE_R2);
-
-	//
-	//fpga_modular_add(&REDUCE_R2, &X25519_ZERO, QX, &X25519_1P);	// 1P!
-
-	uop_add		(UOP_BANK_INT_A, UOP_SRC_INT_REDUCE_R2, UOP_BANK_EXT, UOP_SRC_EXT_ZERO,
-				 UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_DST_INT_REDUCE_R1,
-				 &X25519_1P);
-
-	uop_move	(UOP_BANK_INT_C, UOP_SRC_INT_REDUCE_R1, UOP_BANK_INT_D, UOP_SRC_INT_REDUCE_R1,
-				 UOP_BANK_INT_A, UOP_DST_INT_REDUCE_R2, UOP_BANK_EXT, UOP_DST_EXT_Y);
-
-	// DST_EXT_Y -> QX
-	bank2buffer(UOP_BANK_INT_C, UOP_DST_INT_REDUCE_R1, QX);
+	// finally reduce to just 1*P
+	uop_calc(ADD, BANK_HI, REDUCE_R1, CONST_ZERO, BANK_LO, REDUCE_R2, MOD_1P);	// !!!
 
+	uop_stor(BANK_LO, REDUCE_R2, QX);
 }
 
 
 //------------------------------------------------------------------------------
-static void uop_move	(UOP_BANK src_bank_x, UOP_SRC_OPERAND src_operand_x,
-						 UOP_BANK src_bank_y, UOP_SRC_OPERAND src_operand_y,
-						 UOP_BANK dst_bank_x, UOP_DST_OPERAND dst_operand_x,
-						 UOP_BANK dst_bank_y, UOP_DST_OPERAND dst_operand_y)
+static void uop_move	(UOP_BANK src, UOP_OPERAND s_op1, UOP_OPERAND s_op2,
+						 UOP_BANK dst, UOP_OPERAND d_op1, UOP_OPERAND d_op2)
 //------------------------------------------------------------------------------
 {
-	if ((src_bank_x != UOP_BANK_EXT) && (src_bank_x == src_bank_y)) uop_fatal("ERROR: uop_move(): src_bank_x == src_bank_y!");
-	if ((src_bank_y != UOP_BANK_EXT) && (src_bank_y == src_bank_x)) uop_fatal("ERROR: uop_move(): src_bank_y == src_bank_x!");
-
-	if (dst_bank_x == dst_bank_y) uop_fatal("ERROR: uop_move(): dst_bank_x == dst_bank_y!");
+	FPGA_BUFFER *s_ptr1 = NULL;
+	FPGA_BUFFER *s_ptr2 = NULL;
+	FPGA_BUFFER *d_ptr1 = NULL;
+	FPGA_BUFFER *d_ptr2 = NULL;
 
-	if (src_bank_x == dst_bank_x) uop_fatal("ERROR: uop_move(): src_bank_x == dst_bank_x!");
-	if (src_bank_x == dst_bank_y) uop_fatal("ERROR: uop_move(): src_bank_x == dst_bank_y!");
-
-	if (src_bank_y == dst_bank_x) uop_fatal("ERROR: uop_move(): src_bank_y == dst_bank_x!");
-	if (src_bank_y == dst_bank_y) uop_fatal("ERROR: uop_move(): src_bank_y == dst_bank_y!");
-
-	FPGA_BUFFER *ptr_src_x, *ptr_dst_x;
-	FPGA_BUFFER *ptr_src_y, *ptr_dst_y;
-
-	if (src_bank_x != UOP_BANK_EXT) ptr_src_x = &BANK_INT[src_bank_x][src_operand_x];
-	if (src_bank_y != UOP_BANK_EXT) ptr_src_y = &BANK_INT[src_bank_y][src_operand_y];
-
-	if (src_bank_x == UOP_BANK_EXT)
-	{	switch(src_operand_x)
-		{	case UOP_SRC_EXT_ZERO:	ptr_src_x = &X25519_ZERO; break;
-			case UOP_SRC_EXT_ONE:	ptr_src_x = &X25519_ONE; break;
-			case UOP_SRC_EXT_X:		ptr_src_x = &BANK_EXT_X; break;
-			case UOP_SRC_EXT_A24:	ptr_src_x = &X25519_A24; break;
-		}
+		// same bank?
+	if (src == dst) uop_fatal("ERROR: uop_move(): src == dst");
+	
+		// same operands?
+	//if (s_op1 == d_op1) uop_fatal("ERROR: uop_move(): s_op1 == s_op2");
+	//if (d_op1 == d_op2) uop_fatal("ERROR: uop_move(): d_op1 == d_op2");
+
+		// source filled?
+	if (src == BANK_LO)
+	{	if (!buf_flag_lo[s_op1])
+			uop_fatal("ERROR: uop_move(): !buf_flag_lo[s_op1]");
+		if (!buf_flag_lo[s_op2])
+			uop_fatal("ERROR: uop_move(): !buf_flag_lo[s_op2]");
+		s_ptr1 = &BUF_LO[s_op1];
+		s_ptr2 = &BUF_LO[s_op2];
 	}
-
-	if (src_bank_y == UOP_BANK_EXT)
-	{	switch(src_operand_y)
-		{	case UOP_SRC_EXT_ZERO:	ptr_src_y = &X25519_ZERO; break;
-			case UOP_SRC_EXT_ONE:	ptr_src_y = &X25519_ONE; break;
-			case UOP_SRC_EXT_X:		ptr_src_y = &BANK_EXT_X; break;
-			case UOP_SRC_EXT_A24:	ptr_src_y = &X25519_A24; break;
-		}
+	if (src == BANK_HI)
+	{	if (!buf_flag_hi[s_op1])
+			uop_fatal("ERROR: uop_move(): !buf_flag_hi[s_op1]");
+		if (!buf_flag_hi[s_op2])
+			uop_fatal("ERROR: uop_move(): !buf_flag_hi[s_op2]");
+		s_ptr1 = &BUF_HI[s_op1];
+		s_ptr2 = &BUF_HI[s_op2];
 	}
 
-	if (dst_bank_x != UOP_BANK_EXT) ptr_dst_x = &BANK_INT[dst_bank_x][dst_operand_x];
-	if (dst_bank_y != UOP_BANK_EXT) ptr_dst_y = &BANK_INT[dst_bank_y][dst_operand_y];
-
-	if (dst_bank_x == UOP_BANK_EXT)
-	{	switch(dst_operand_x)
-		{	case UOP_DST_EXT_Y:		ptr_dst_x = &BANK_EXT_Y; break;
-		}
+	if (d_op1 == CONST_ZERO) uop_fatal("ERROR: uop_move(): d_op1 == CONST_ZERO");
+	if (d_op2 == CONST_ZERO) uop_fatal("ERROR: uop_move(): d_op2 == CONST_ZERO");
+	if (d_op1 == CONST_ONE) uop_fatal("ERROR: uop_move(): d_op1 == CONST_ONE");
+	if (d_op2 == CONST_ONE) uop_fatal("ERROR: uop_move(): d_op2 == CONST_ONE");
+	if (d_op1 == CONST_A24) uop_fatal("ERROR: uop_move(): d_op1 == CONST_A24");
+	if (d_op2 == CONST_A24) uop_fatal("ERROR: uop_move(): d_op2 == CONST_A24");
+
+	if (dst == BANK_LO)
+	{	buf_flag_lo[d_op1] = true;
+		buf_flag_lo[d_op2] = true;
+		d_ptr1 = &BUF_LO[d_op1];
+		d_ptr2 = &BUF_LO[d_op2];
 	}
-
-	if (dst_bank_y == UOP_BANK_EXT)
-	{	switch(dst_operand_y)
-		{	case UOP_DST_EXT_Y:		ptr_dst_y = &BANK_EXT_Y; break;
-		}
+	if (dst == BANK_HI)
+	{	buf_flag_hi[d_op1] = true;
+		buf_flag_hi[d_op2] = true;
+		d_ptr1 = &BUF_HI[d_op1];
+		d_ptr2 = &BUF_HI[d_op2];
 	}
 
-	fpga_multiword_copy(ptr_src_x, ptr_dst_x);
-	fpga_multiword_copy(ptr_src_y, ptr_dst_y);
-
-	if (dst_bank_x != UOP_BANK_EXT)	bank_flags[dst_bank_x][dst_operand_x] = true;
-	if (dst_bank_y != UOP_BANK_EXT)	bank_flags[dst_bank_y][dst_operand_y] = true;
+	fpga_multiword_copy(s_ptr1, d_ptr1);
+	fpga_multiword_copy(s_ptr2, d_ptr2);
 }
 
 
 //------------------------------------------------------------------------------
-static void uop_add		(UOP_BANK src_bank_x, UOP_SRC_OPERAND src_operand_x,
-						 UOP_BANK src_bank_y, UOP_SRC_OPERAND src_operand_y,
-						 UOP_BANK dst_bank_x, UOP_DST_OPERAND dst_operand_x,
-						 UOP_BANK dst_bank_y, UOP_DST_OPERAND dst_operand_y,
-						 FPGA_BUFFER *modulus)
+static void uop_calc	(UOP_MATH math,
+						 UOP_BANK src, UOP_OPERAND s_op1, UOP_OPERAND s_op2,
+						 UOP_BANK dst, UOP_OPERAND d_op,
+						 UOP_MODULUS mod)
 //------------------------------------------------------------------------------
 {
-	if ((src_bank_x != UOP_BANK_EXT) && (src_bank_x == src_bank_y)) uop_fatal("ERROR: uop_move(): src_bank_x == src_bank_y!");
-	if ((src_bank_y != UOP_BANK_EXT) && (src_bank_y == src_bank_x)) uop_fatal("ERROR: uop_move(): src_bank_y == src_bank_x!");
-
-	if (dst_bank_x == dst_bank_y) uop_fatal("ERROR: uop_move(): dst_bank_x == dst_bank_y!");
-
-	if (src_bank_x == dst_bank_x) uop_fatal("ERROR: uop_move(): src_bank_x == dst_bank_x!");
-	if (src_bank_x == dst_bank_y) uop_fatal("ERROR: uop_move(): src_bank_x == dst_bank_y!");
-
-	if (src_bank_y == dst_bank_x) uop_fatal("ERROR: uop_move(): src_bank_y == dst_bank_x!");
-	if (src_bank_y == dst_bank_y) uop_fatal("ERROR: uop_move(): src_bank_y == dst_bank_y!");
-
-	FPGA_BUFFER *ptr_src_x, *ptr_dst_x;
-	FPGA_BUFFER *ptr_src_y, *ptr_dst_y;
-
-	if (src_bank_x != UOP_BANK_EXT) ptr_src_x = &BANK_INT[src_bank_x][src_operand_x];
-	if (src_bank_y != UOP_BANK_EXT) ptr_src_y = &BANK_INT[src_bank_y][src_operand_y];
-
-	if (src_bank_x == UOP_BANK_EXT)
-	{	switch(src_operand_x)
-		{	case UOP_SRC_EXT_ZERO:	ptr_src_x = &X25519_ZERO; break;
-			case UOP_SRC_EXT_ONE:	ptr_src_x = &X25519_ONE; break;
-			case UOP_SRC_EXT_X:		ptr_src_x = &BANK_EXT_X; break;
-			case UOP_SRC_EXT_A24:	ptr_src_x = &X25519_A24; break;
-		}
+	FPGA_BUFFER *s_ptr1 = NULL;
+	FPGA_BUFFER *s_ptr2 = NULL;
+	FPGA_BUFFER *d_ptr = NULL;
+	FPGA_BUFFER *n_ptr = NULL;
+
+		// same bank?
+	if (src == dst)
+		uop_fatal("ERROR: uop_calc(): src == dst");
+	
+		// same operands?
+	//if (s_op1 == s_op2)
+		//uop_fatal("ERROR: uop_calc(): s_op1 == s_op2");
+
+		// sources filled?
+	if (src == BANK_LO)
+	{	if (!buf_flag_lo[s_op1])
+			uop_fatal("ERROR: uop_calc(): !buf_flag_lo[s_op1]");
+		if (!buf_flag_lo[s_op2])
+			uop_fatal("ERROR: uop_calc(): !buf_flag_lo[s_op2]");
+		s_ptr1 = &BUF_LO[s_op1];
+		s_ptr2 = &BUF_LO[s_op2];
 	}
-
-	if (src_bank_y == UOP_BANK_EXT)
-	{	switch(src_operand_y)
-		{	case UOP_SRC_EXT_ZERO:	ptr_src_y = &X25519_ZERO; break;
-			case UOP_SRC_EXT_ONE:	ptr_src_y = &X25519_ONE; break;
-			case UOP_SRC_EXT_X:		ptr_src_y = &BANK_EXT_X; break;
-			case UOP_SRC_EXT_A24:	ptr_src_y = &X25519_A24; break;
-		}
+	if (src == BANK_HI)
+	{	if (!buf_flag_hi[s_op1])
+			uop_fatal("ERROR: uop_calc(): !buf_flag_hi[s_op1]");
+		if (!buf_flag_hi[s_op2])
+			uop_fatal("ERROR: uop_calc(): !buf_flag_hi[s_op2]");
+		s_ptr1 = &BUF_HI[s_op1];
+		s_ptr2 = &BUF_HI[s_op2];
 	}
 
-	if (dst_bank_x != UOP_BANK_EXT) ptr_dst_x = &BANK_INT[dst_bank_x][dst_operand_x];
-	if (dst_bank_y != UOP_BANK_EXT) ptr_dst_y = &BANK_INT[dst_bank_y][dst_operand_y];
+	if (d_op == CONST_ZERO) uop_fatal("ERROR: uop_calc(): d_op == CONST_ZERO");
+	if (d_op == CONST_ONE) uop_fatal("ERROR: uop_calc(): d_op == CONST_ONE");
+	if (d_op == CONST_A24) uop_fatal("ERROR: uop_calc(): d_op == CONST_A24");
 
-	if (dst_bank_x == UOP_BANK_EXT)
-	{	switch(dst_operand_x)
-		{	case UOP_DST_EXT_Y:		ptr_dst_x = &BANK_EXT_Y; break;
-		}
+	if (dst == BANK_LO)
+	{	buf_flag_lo[d_op] = true;
+		d_ptr = &BUF_LO[d_op];
 	}
-
-	if (dst_bank_y == UOP_BANK_EXT)
-	{	switch(dst_operand_y)
-		{	case UOP_DST_EXT_Y:		ptr_dst_y = &BANK_EXT_Y; break;
-		}
+	if (dst == BANK_HI)
+	{	buf_flag_hi[d_op] = true;
+		d_ptr = &BUF_HI[d_op];
 	}
 
-	FPGA_BUFFER S;
-	fpga_modular_add(ptr_src_x, ptr_src_y, &S, modulus);
-	fpga_multiword_copy(&S, ptr_dst_x);
-	fpga_multiword_copy(&S, ptr_dst_y);
+	if (mod == MOD_1P) n_ptr = &X25519_1P;
+	if (mod == MOD_2P) n_ptr = &X25519_2P;
 
-	if (dst_bank_x != UOP_BANK_EXT)	bank_flags[dst_bank_x][dst_operand_x] = true;
-	if (dst_bank_y != UOP_BANK_EXT)	bank_flags[dst_bank_y][dst_operand_y] = true;
+	if (math == ADD) fpga_modular_add(s_ptr1, s_ptr2, d_ptr, n_ptr);
+	if (math == SUB) fpga_modular_sub(s_ptr1, s_ptr2, d_ptr, n_ptr);
+	if (math == MUL) fpga_modular_mul(s_ptr1, s_ptr2, d_ptr, n_ptr);
 }
 
 
 //------------------------------------------------------------------------------
-static void uop_sub		(UOP_BANK src_bank_x, UOP_SRC_OPERAND src_operand_x,
-						 UOP_BANK src_bank_y, UOP_SRC_OPERAND src_operand_y,
-						 UOP_BANK dst_bank_x, UOP_DST_OPERAND dst_operand_x,
-						 UOP_BANK dst_bank_y, UOP_DST_OPERAND dst_operand_y,
-						 FPGA_BUFFER *modulus)
+static void uop_load(FPGA_BUFFER *mem, UOP_BANK dst, UOP_OPERAND d_op)
 //------------------------------------------------------------------------------
 {
-	if ((src_bank_x != UOP_BANK_EXT) && (src_bank_x == src_bank_y)) uop_fatal("ERROR: uop_move(): src_bank_x == src_bank_y!");
-	if ((src_bank_y != UOP_BANK_EXT) && (src_bank_y == src_bank_x)) uop_fatal("ERROR: uop_move(): src_bank_y == src_bank_x!");
-
-	if (dst_bank_x == dst_bank_y) uop_fatal("ERROR: uop_move(): dst_bank_x == dst_bank_y!");
-
-	if (src_bank_x == dst_bank_x) uop_fatal("ERROR: uop_move(): src_bank_x == dst_bank_x!");
-	if (src_bank_x == dst_bank_y) uop_fatal("ERROR: uop_move(): src_bank_x == dst_bank_y!");
-
-	if (src_bank_y == dst_bank_x) uop_fatal("ERROR: uop_move(): src_bank_y == dst_bank_x!");
-	if (src_bank_y == dst_bank_y) uop_fatal("ERROR: uop_move(): src_bank_y == dst_bank_y!");
-
-	FPGA_BUFFER *ptr_src_x, *ptr_dst_x;
-	FPGA_BUFFER *ptr_src_y, *ptr_dst_y;
-
-	if (src_bank_x != UOP_BANK_EXT) ptr_src_x = &BANK_INT[src_bank_x][src_operand_x];
-	if (src_bank_y != UOP_BANK_EXT) ptr_src_y = &BANK_INT[src_bank_y][src_operand_y];
-
-	if (src_bank_x == UOP_BANK_EXT)
-	{	switch(src_operand_x)
-		{	case UOP_SRC_EXT_ZERO:	ptr_src_x = &X25519_ZERO; break;
-			case UOP_SRC_EXT_ONE:	ptr_src_x = &X25519_ONE; break;
-			case UOP_SRC_EXT_X:		ptr_src_x = &BANK_EXT_X; break;
-			case UOP_SRC_EXT_A24:	ptr_src_x = &X25519_A24; break;
-		}
+	if (d_op == CONST_ZERO) uop_fatal("ERROR: uop_load(): d_op1 == CONST_ZERO");
+	if (d_op == CONST_ONE) uop_fatal("ERROR: uop_load(): d_op1 == CONST_ONE");
+	if (d_op == CONST_A24) uop_fatal("ERROR: uop_load(): d_op1 == CONST_A24");
+
+	FPGA_BUFFER *d_ptr = NULL;
+	if (dst == BANK_LO)
+	{	d_ptr = &BUF_LO[d_op];
+		buf_flag_lo[d_op] = true;
 	}
-
-	if (src_bank_y == UOP_BANK_EXT)
-	{	switch(src_operand_y)
-		{	case UOP_SRC_EXT_ZERO:	ptr_src_y = &X25519_ZERO; break;
-			case UOP_SRC_EXT_ONE:	ptr_src_y = &X25519_ONE; break;
-			case UOP_SRC_EXT_X:		ptr_src_y = &BANK_EXT_X; break;
-			case UOP_SRC_EXT_A24:	ptr_src_y = &X25519_A24; break;
-		}
+	if (dst == BANK_HI)
+	{	d_ptr = &BUF_HI[d_op];
+		buf_flag_hi[d_op] = true;
 	}
 
-	if (dst_bank_x != UOP_BANK_EXT) ptr_dst_x = &BANK_INT[dst_bank_x][dst_operand_x];
-	if (dst_bank_y != UOP_BANK_EXT) ptr_dst_y = &BANK_INT[dst_bank_y][dst_operand_y];
-
-	if (dst_bank_x == UOP_BANK_EXT)
-	{	switch(dst_operand_x)
-		{	case UOP_DST_EXT_Y:		ptr_dst_x = &BANK_EXT_Y; break;
-		}
-	}
-
-	if (dst_bank_y == UOP_BANK_EXT)
-	{	switch(dst_operand_y)
-		{	case UOP_DST_EXT_Y:		ptr_dst_y = &BANK_EXT_Y; break;
-		}
-	}
-
-	FPGA_BUFFER D;
-	fpga_modular_sub(ptr_src_x, ptr_src_y, &D, modulus);
-	fpga_multiword_copy(&D, ptr_dst_x);
-	fpga_multiword_copy(&D, ptr_dst_y);
-
-	if (dst_bank_x != UOP_BANK_EXT)	bank_flags[dst_bank_x][dst_operand_x] = true;
-	if (dst_bank_y != UOP_BANK_EXT)	bank_flags[dst_bank_y][dst_operand_y] = true;
+	fpga_multiword_copy(mem, d_ptr);
 }
 
 
 //------------------------------------------------------------------------------
-static void uop_mul		(UOP_BANK src_bank_x, UOP_SRC_OPERAND src_operand_x,
-						 UOP_BANK src_bank_y, UOP_SRC_OPERAND src_operand_y,
-						 UOP_BANK dst_bank_x, UOP_DST_OPERAND dst_operand_x,
-						 UOP_BANK dst_bank_y, UOP_DST_OPERAND dst_operand_y)
+static void uop_stor(UOP_BANK src, UOP_OPERAND s_op, FPGA_BUFFER *mem)
 //------------------------------------------------------------------------------
 {
-	if ((src_bank_x != UOP_BANK_EXT) && (src_bank_x == src_bank_y)) uop_fatal("ERROR: uop_move(): src_bank_x == src_bank_y!");
-	if ((src_bank_y != UOP_BANK_EXT) && (src_bank_y == src_bank_x)) uop_fatal("ERROR: uop_move(): src_bank_y == src_bank_x!");
-
-	if (dst_bank_x == dst_bank_y) uop_fatal("ERROR: uop_move(): dst_bank_x == dst_bank_y!");
-
-	if (src_bank_x == dst_bank_x) uop_fatal("ERROR: uop_move(): src_bank_x == dst_bank_x!");
-	if (src_bank_x == dst_bank_y) uop_fatal("ERROR: uop_move(): src_bank_x == dst_bank_y!");
-
-	if (src_bank_y == dst_bank_x) uop_fatal("ERROR: uop_move(): src_bank_y == dst_bank_x!");
-	if (src_bank_y == dst_bank_y) uop_fatal("ERROR: uop_move(): src_bank_y == dst_bank_y!");
-
-	FPGA_BUFFER *ptr_src_x, *ptr_dst_x;
-	FPGA_BUFFER *ptr_src_y, *ptr_dst_y;
-
-	if (src_bank_x != UOP_BANK_EXT) ptr_src_x = &BANK_INT[src_bank_x][src_operand_x];
-	if (src_bank_y != UOP_BANK_EXT) ptr_src_y = &BANK_INT[src_bank_y][src_operand_y];
-
-	if (src_bank_x == UOP_BANK_EXT)
-	{	switch(src_operand_x)
-		{	case UOP_SRC_EXT_ZERO:	ptr_src_x = &X25519_ZERO; break;
-			case UOP_SRC_EXT_ONE:	ptr_src_x = &X25519_ONE; break;
-			case UOP_SRC_EXT_X:		ptr_src_x = &BANK_EXT_X; break;
-			case UOP_SRC_EXT_A24:	ptr_src_x = &X25519_A24; break;
-		}
-	}
-
-	if (src_bank_y == UOP_BANK_EXT)
-	{	switch(src_operand_y)
-		{	case UOP_SRC_EXT_ZERO:	ptr_src_y = &X25519_ZERO; break;
-			case UOP_SRC_EXT_ONE:	ptr_src_y = &X25519_ONE; break;
-			case UOP_SRC_EXT_X:		ptr_src_y = &BANK_EXT_X; break;
-			case UOP_SRC_EXT_A24:	ptr_src_y = &X25519_A24; break;
-		}
-	}
-
-	if (dst_bank_x != UOP_BANK_EXT) ptr_dst_x = &BANK_INT[dst_bank_x][dst_operand_x];
-	if (dst_bank_y != UOP_BANK_EXT) ptr_dst_y = &BANK_INT[dst_bank_y][dst_operand_y];
-
-	if (dst_bank_x == UOP_BANK_EXT)
-	{	switch(dst_operand_x)
-		{	case UOP_DST_EXT_Y:		ptr_dst_x = &BANK_EXT_Y; break;
-		}
+	FPGA_BUFFER *s_ptr = NULL;
+	if (src == BANK_LO)
+	{	if (!buf_flag_lo[s_op])
+			uop_fatal("ERROR: uop_stor(): !buf_flag_lo[s_op]");
+		s_ptr = &BUF_LO[s_op];
+		buf_flag_lo[s_op] = true;
 	}
-
-	if (dst_bank_y == UOP_BANK_EXT)
-	{	switch(dst_operand_y)
-		{	case UOP_DST_EXT_Y:		ptr_dst_y = &BANK_EXT_Y; break;
-		}
+	if (src == BANK_HI)
+	{	if (!buf_flag_hi[s_op])
+			uop_fatal("ERROR: uop_stor(): !buf_flag_hi[s_op]");
+		s_ptr = &BUF_HI[s_op];
+		buf_flag_hi[s_op] = true;
 	}
 
-	FPGA_BUFFER P;
-	fpga_modular_mul(ptr_src_x, ptr_src_y, &P, &X25519_2P);
-	fpga_multiword_copy(&P, ptr_dst_x);
-	fpga_multiword_copy(&P, ptr_dst_y);
-
-	if (dst_bank_x != UOP_BANK_EXT)	bank_flags[dst_bank_x][dst_operand_x] = true;
-	if (dst_bank_y != UOP_BANK_EXT)	bank_flags[dst_bank_y][dst_operand_y] = true;
-}
-
-
-//------------------------------------------------------------------------------
-static void bank2buffer(UOP_BANK bank, UOP_DST_OPERAND operand, FPGA_BUFFER *buffer)
-//------------------------------------------------------------------------------
-{
-	if (bank == UOP_BANK_EXT) uop_fatal("ERROR: bank2buffer(): bank == UOP_BANK_EXT!");
-	if (operand == UOP_DST_EXT_Y) uop_fatal("ERROR: bank2buffer(): operand == UOP_DST_EXT_Y!");
-	if (!bank_flags[bank][operand])
-		uop_fatal("ERROR: bank2buffer(): !bank_flags[bank][operand]!");
-
-	fpga_multiword_copy(&BANK_INT[bank][operand], buffer);
-}
-
-
-//------------------------------------------------------------------------------
-static void buffer2bank(FPGA_BUFFER *buffer, UOP_BANK bank, UOP_SRC_OPERAND operand)
-//------------------------------------------------------------------------------
-{
-	if (bank == UOP_BANK_EXT) uop_fatal("ERROR: buffer2bank(): bank == UOP_BANK_EXT!");
-	if (operand == UOP_SRC_EXT_ZERO) uop_fatal("ERROR: buffer2bank(): operand == UOP_SRC_EXT_ZERO!");
-	if (operand == UOP_SRC_EXT_ONE) uop_fatal("ERROR: buffer2bank(): operand == UOP_SRC_EXT_ONE!");
-	if (operand == UOP_SRC_EXT_X) uop_fatal("ERROR: buffer2bank(): operand == UOP_SRC_EXT_X!");
-
-	fpga_multiword_copy(buffer, &BANK_INT[bank][operand]);
+	fpga_multiword_copy(s_ptr, mem);
 }
 
 



More information about the Commits mailing list