[Cryptech-Commits] [user/shatov/ecdsa_fpga_model] 05/06: Updated curve math layer to do multiplication using the Montgomery ladder method. Also added optional debugging output to help debug microcoded versions of double and add routines.

git at cryptech.is git at cryptech.is
Sun Apr 11 14:46:55 UTC 2021


This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/ecdsa_fpga_model.

commit 1e16303d718986e0e991444a7cdcab3c5c89b1f4
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Sun Apr 11 17:42:52 2021 +0300

    Updated curve math layer to do multiplication using the Montgomery ladder
    method. Also added optional debugging output to help debug microcoded versions
    of double and add routines.
---
 ecdsa_fpga_curve.h             |  70 +++---
 ecdsa_fpga_curve_abstract.cpp  | 326 +++++++++++++-----------
 ecdsa_fpga_curve_microcode.cpp | 547 ++++++++++++++++++++---------------------
 3 files changed, 488 insertions(+), 455 deletions(-)

diff --git a/ecdsa_fpga_curve.h b/ecdsa_fpga_curve.h
index 00448eb..e9f2fe6 100644
--- a/ecdsa_fpga_curve.h
+++ b/ecdsa_fpga_curve.h
@@ -6,7 +6,7 @@
 //
 // Authors: Pavel Shatov
 //
-// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+// Copyright (c) 2015-2016, 2018, 2021 NORDUnet A/S
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
@@ -139,14 +139,33 @@ extern FPGA_BUFFER ECDSA_N;
 #ifdef USE_MICROCODE
 
 #define fpga_curve_base_scalar_multiply fpga_curve_base_scalar_multiply_microcode
-#define fpga_curve_add_jacobian         fpga_curve_add_jacobian_microcode_wrapper
-#define fpga_curve_double_jacobian      fpga_curve_double_jacobian_microcode_wrapper
+
+#define fpga_curve_add_jacobian_2_shim  fpga_curve_add_jacobian_microcode_2_wrapper
+#define fpga_curve_double_jacobian_shim fpga_curve_double_jacobian_microcode_wrapper
+
+void fpga_curve_double_jacobian_microcode_wrapper (const FPGA_BUFFER *px,
+                                                   const FPGA_BUFFER *py,
+                                                   const FPGA_BUFFER *pz,
+                                                         FPGA_BUFFER *rx,
+                                                         FPGA_BUFFER *ry,
+                                                         FPGA_BUFFER *rz);
+
+void fpga_curve_add_jacobian_microcode_2_wrapper(const FPGA_BUFFER *px,
+                                                 const FPGA_BUFFER *py,
+                                                 const FPGA_BUFFER *pz,
+                                                 const FPGA_BUFFER *qx,
+                                                 const FPGA_BUFFER *qy,
+                                                 const FPGA_BUFFER *qz,
+                                                       FPGA_BUFFER *rx,
+                                                       FPGA_BUFFER *ry,
+                                                       FPGA_BUFFER *rz);
 
 #else
 
 #define fpga_curve_base_scalar_multiply fpga_curve_base_scalar_multiply_abstract
-#define fpga_curve_add_jacobian         fpga_curve_add_jacobian_abstract
-#define fpga_curve_double_jacobian      fpga_curve_double_jacobian_abstract
+
+#define fpga_curve_add_jacobian_2_shim  fpga_curve_add_jacobian_abstract_2
+#define fpga_curve_double_jacobian_shim fpga_curve_double_jacobian_abstract
 
 #endif
 
@@ -156,20 +175,23 @@ extern FPGA_BUFFER ECDSA_N;
 //------------------------------------------------------------------------------
 void fpga_curve_init ();
 
-void fpga_curve_base_scalar_multiply_abstract  (const FPGA_BUFFER *k,
-                                                      FPGA_BUFFER *qx,
-                                                      FPGA_BUFFER *qy);
+void fpga_curve_base_scalar_multiply_abstract (const FPGA_BUFFER *k,
+                                                     FPGA_BUFFER *qx,
+                                                     FPGA_BUFFER *qy);
 
 void fpga_curve_base_scalar_multiply_microcode (const FPGA_BUFFER *k,
                                                       FPGA_BUFFER *qx,
                                                       FPGA_BUFFER *qy);
 
-void fpga_curve_add_jacobian_abstract    (const FPGA_BUFFER *px,
-                                          const FPGA_BUFFER *py,
-                                          const FPGA_BUFFER *pz,
-                                                FPGA_BUFFER *rx,
-                                                FPGA_BUFFER *ry,
-                                                FPGA_BUFFER *rz);
+void fpga_curve_add_jacobian_abstract_2 (const FPGA_BUFFER *px,
+                                         const FPGA_BUFFER *py,
+                                         const FPGA_BUFFER *pz,
+                                         const FPGA_BUFFER *qx,
+                                         const FPGA_BUFFER *qy,
+                                         const FPGA_BUFFER *qz,
+                                               FPGA_BUFFER *rx,
+                                               FPGA_BUFFER *ry,
+                                               FPGA_BUFFER *rz);
 
 void fpga_curve_double_jacobian_abstract (const FPGA_BUFFER *px,
                                           const FPGA_BUFFER *py,
@@ -178,24 +200,10 @@ void fpga_curve_double_jacobian_abstract (const FPGA_BUFFER *px,
                                                 FPGA_BUFFER *ry,
                                                 FPGA_BUFFER *rz);
 
-void fpga_curve_add_jacobian_microcode    ();
-
-void fpga_curve_double_jacobian_microcode ();
-
-void fpga_curve_add_jacobian_microcode_wrapper    (const FPGA_BUFFER *px,
-                                                   const FPGA_BUFFER *py,
-                                                   const FPGA_BUFFER *pz,
-                                                         FPGA_BUFFER *rx,
-                                                         FPGA_BUFFER *ry,
-                                                         FPGA_BUFFER *rz);
+void fpga_curve_add_jacobian_microcode_2    ();
 
-
-void fpga_curve_double_jacobian_microcode_wrapper (const FPGA_BUFFER *px,
-                                                   const FPGA_BUFFER *py,
-                                                   const FPGA_BUFFER *pz,
-                                                         FPGA_BUFFER *rx,
-                                                         FPGA_BUFFER *ry,
-                                                         FPGA_BUFFER *rz);
+void fpga_curve_double_jacobian_microcode_r0    ();
+void fpga_curve_double_jacobian_microcode_r1    ();
 
 
 //------------------------------------------------------------------------------
diff --git a/ecdsa_fpga_curve_abstract.cpp b/ecdsa_fpga_curve_abstract.cpp
index 5510ac1..2d25cfc 100644
--- a/ecdsa_fpga_curve_abstract.cpp
+++ b/ecdsa_fpga_curve_abstract.cpp
@@ -6,7 +6,7 @@
 //
 // Authors: Pavel Shatov
 //
-// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+// Copyright (c) 2015-2016, 2018, 2021 NORDUnet A/S
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
@@ -79,14 +79,7 @@ void fpga_curve_init()
 // Q(qx,qy) = k * G(px,py)
 //
 // Note, that Q is supposed to be in affine coordinates. Multiplication is done
-// using the double-and-add algorithm 3.27 from "Guide to Elliptic Curve
-// Cryptography".
-//
-// WARNING: Though this procedure always does the addition step, it only
-// updates the result when current bit of k is set. It does not take any
-// active measures to keep run-time constant. The main purpose of this model
-// is to help debug Verilog code for FPGA, so *DO NOT* use it anywhere near
-// production!
+// using the Montgomery ladder method.
 //
 //------------------------------------------------------------------------------
 void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
@@ -94,42 +87,91 @@ void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER
 {
     int word_count, bit_count;  // counters
 
-    FPGA_BUFFER rx, ry, rz;     // intermediate result
-    FPGA_BUFFER tx, ty, tz;     // temporary variable
+    FPGA_BUFFER r0x, r0y, r0z;  // intermediate result
+    FPGA_BUFFER r1x, r1y, r1z;  // intermediate result
+    FPGA_BUFFER sx, sy, sz;     // temporary variable
+	FPGA_BUFFER tx, ty, tz;     // temporary variable
+
+        /* set initial value of R0 to point at infinity, R1 to the base point */
+    fpga_multiword_copy(&ECDSA_ONE,  &r0x);
+    fpga_multiword_copy(&ECDSA_ONE,  &r0y);
+    fpga_multiword_copy(&ECDSA_ZERO, &r0z);
 
-        /* set initial value of R to point at infinity */
-    fpga_multiword_copy(&ECDSA_ONE,  &rx);
-    fpga_multiword_copy(&ECDSA_ONE,  &ry);
-    fpga_multiword_copy(&ECDSA_ZERO, &rz);
+    fpga_multiword_copy(&ECDSA_GX,  &r1x);
+    fpga_multiword_copy(&ECDSA_GY,  &r1y);
+    fpga_multiword_copy(&ECDSA_ONE, &r1z);
+
+        /* handy vars */
+    FPGA_WORD k_word_shifted;
+    bool k_bit;
 
         /* process bits of k left-to-right */
     for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--)
         for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
         {
-                /* calculate T = 2 * R */
-            fpga_curve_double_jacobian_abstract(&rx, &ry, &rz, &tx, &ty, &tz);
-
-                /* always calculate R = T + P for constant-time */
-            fpga_curve_add_jacobian_abstract(&tx, &ty, &tz, &rx, &ry, &rz);
-
-                /* revert to the value of T before addition if the current bit of k is not set */
-            if (!((k->words[word_count-1] >> (bit_count-1)) & 1))
-            {   fpga_multiword_copy(&tx, &rx);
-                fpga_multiword_copy(&ty, &ry);
-                fpga_multiword_copy(&tz, &rz);
-            }
-
+            k_word_shifted = k->words[word_count-1] >> (bit_count-1);
+			k_bit = (k_word_shifted & 1) == 1;
+
+#ifdef DUMP_CYCLE_STATES
+            dump_cycle_header(word_count, bit_count, k_bit);
+#endif
+
+                /* calculate S = R0 + R1 */
+            fpga_curve_add_jacobian_abstract_2(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z, &sx, &sy, &sz);
+
+                /* calculate T = 2 * (R0 | R1) */
+			if (!k_bit)
+				fpga_curve_double_jacobian_abstract(&r0x, &r0y, &r0z, &tx, &ty, &tz);
+			else
+				fpga_curve_double_jacobian_abstract(&r1x, &r1y, &r1z, &tx, &ty, &tz);
+
+                //
+                // dump cycle state
+                //
+#ifdef DUMP_CYCLE_STATES
+            dump_cycle_state(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z,
+                             &sx,  &sy,  &sz,  &tx,  &ty,  &tz);
+#endif
+
+				/* now update working variables */
+			if (!k_bit)
+			{	fpga_multiword_copy(&tx, &r0x);
+				fpga_multiword_copy(&ty, &r0y);
+				fpga_multiword_copy(&tz, &r0z);
+
+				fpga_multiword_copy(&sx, &r1x);
+				fpga_multiword_copy(&sy, &r1y);
+				fpga_multiword_copy(&sz, &r1z);
+			}
+			else
+			{	fpga_multiword_copy(&tx, &r1x);
+				fpga_multiword_copy(&ty, &r1y);
+				fpga_multiword_copy(&tz, &r1z);
+
+				fpga_multiword_copy(&sx, &r0x);
+				fpga_multiword_copy(&sy, &r0y);
+				fpga_multiword_copy(&sz, &r0z);
+			}
         }
 
+		//
+		// we now need to convert the point to affine coordinates
+		//
     FPGA_BUFFER a2, a3; 
 
-	fpga_modular_inv23(&rz, &a2, &a3);
+#ifdef DUMP_UOP_OUTPUTS
+    _DUMP_MODULAR_RESULTS = true;
+#endif
+
+	fpga_modular_inv23(&r0z, &a2, &a3);
+
+    fpga_modular_mul(&r0x, &a2, qx);      // qx = px * (pz^-1)^2 (mod q)
+    fpga_modular_mul(&r0y, &a3, qy);      // qy = py * (pz^-1)^3 (mod q)
 
-    fpga_modular_mul(&rx, &a2, qx);      // qx = px * (pz^-1)^2 (mod q)
-    fpga_modular_mul(&ry, &a3, qy);      // qy = py * (pz^-1)^3 (mod q)
+    _DUMP_MODULAR_RESULTS = false;
 
         // check, that rz is non-zero (not point at infinity)
-    bool rz_is_zero = fpga_multiword_is_zero(&rz);
+    bool rz_is_zero = fpga_multiword_is_zero(&r0z);
 
         // handle special case (result is point at infinity)
     if (rz_is_zero)
@@ -154,21 +196,13 @@ void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER
 // faster, than multiplication.
 //
 // Note, that this routine also handles one special case, namely when P is at
-// infinity.
+// infinity. No actual extra "handling" is necessary, since when pz is zero,
+// rz will also be zero (and that's what the "at infinity" check takes into
+// account).
 //
 // Instead of actual modular division, multiplication by pre-computed constant
 // (2^-1 mod q) is done.
 //
-// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
-// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
-// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
-// then fpga_modular_mul(pz, &t1, &t1) instead.
-//
-// WARNING: Though this procedure always does doubling steps, it does not take
-// any active measures to keep run-time constant. The main purpose of this
-// model is to help debug Verilog code for FPGA, so *DO NOT* use is anywhere
-// near production!
-//
 //------------------------------------------------------------------------------
 void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px,
                                          const FPGA_BUFFER *py,
@@ -178,41 +212,32 @@ void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px,
                                                FPGA_BUFFER *rz)
 //------------------------------------------------------------------------------
 {
-    FPGA_BUFFER t1, t2, t3; // temporary variables
-
-        // check, whether P is at infinity
-    bool pz_is_zero = fpga_multiword_is_zero(pz);
-
-    /*  2. */ fpga_multiword_copy(pz,  &t1);
-              fpga_modular_mul(pz,  &t1,          &t1);
-    /*  3. */ fpga_modular_sub(px,  &t1,          &t2);
-    /*  4. */ fpga_modular_add(px,  &t1,          &t1);
-    /*  5. */ fpga_modular_mul(&t1, &t2,          &t2);
-    /*  6. */ fpga_modular_add(&t2, &t2,          &t1);
-    /*     */ fpga_modular_add(&t1, &t2,          &t2);
-    /*  7. */ fpga_modular_add(py,  py,           ry);
-    /*  8. */ fpga_modular_mul(pz,  ry,           rz);
-    /*  9. */ fpga_multiword_copy(ry,  &t1);
-              fpga_multiword_copy(ry,  &t3);
-              fpga_modular_mul(&t1, &t3,          ry);
-    /* 10. */ fpga_modular_mul(px,  ry,           &t3);
-    /* 11. */ fpga_multiword_copy(ry,  &t1);
-              fpga_modular_mul(ry,  &t1,          &t1);
-    /* 12. */ fpga_modular_mul(&t1, &ECDSA_DELTA, ry);
-    /* 13. */ fpga_multiword_copy(&t2, &t1);
-              fpga_modular_mul(&t1, &t2,          rx);
-    /* 14. */ fpga_modular_add(&t3, &t3,          &t1);
-    /* 15. */ fpga_modular_sub(rx,  &t1,          rx);
-    /* 16. */ fpga_modular_sub(&t3, rx,           &t1); 
-    /* 17. */ fpga_modular_mul(&t1, &t2,          &t1);
-    /* 18. */ fpga_modular_sub(&t1, ry,           ry);  
-
-        // handle special case (input point is at infinity)
-    if (pz_is_zero)
-    {   fpga_multiword_copy(&ECDSA_ONE,  rx);
-        fpga_multiword_copy(&ECDSA_ONE,  ry);
-        fpga_multiword_copy(&ECDSA_ZERO, rz);
-    }
+    FPGA_BUFFER t1, t2, t3, t4, t5; // temporary variables
+
+#ifdef DUMP_UOP_OUTPUTS
+    _DUMP_MODULAR_RESULTS = true;
+#endif
+
+    fpga_modular_mul(pz,  pz,           &t1);
+    fpga_modular_sub(px,  &t1,          &t2);
+    fpga_modular_add(px,  &t1,          &t3);
+    fpga_modular_mul(&t3, &t2,          &t4);
+    fpga_modular_add(&t4, &t4,          &t1);
+    fpga_modular_add(&t1, &t4,          &t2);
+    fpga_modular_add(py,  py,           ry);
+    fpga_modular_mul(pz,  ry,           rz);
+	fpga_modular_mul(ry,  ry,           &t1);
+    fpga_modular_mul(px,  &t1,          &t3);
+	fpga_modular_mul(&t1, &t1,          &t4);
+	fpga_modular_mul(&t4, &ECDSA_DELTA, &t5);
+	fpga_modular_mul(&t2, &t2,          &t4);
+    fpga_modular_add(&t3, &t3,          &t1);
+    fpga_modular_sub(&t4, &t1,          rx); 
+    fpga_modular_sub(&t3, rx,           &t1);
+    fpga_modular_mul(&t1, &t2,          &t3);
+    fpga_modular_sub(&t3, &t5,          ry);
+
+    _DUMP_MODULAR_RESULTS = false;
 }
 
 
@@ -220,89 +245,94 @@ void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px,
 //
 // Elliptic curve point addition routine.
 //
-// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy)
+// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy,qz)
 //
-// Note, that P(px, py, pz) is supposed to be in projective Jacobian
-// coordinates, while Q(qx,qy) is supposed to be in affine coordinates,
-// R(rx, ry, rz) will be in projective Jacobian coordinates. Moreover, in this
-// particular implementation Q is always the base point G.
+// Note, that P(px, py, pz) and Q(qx, qy, qz) are supposed to be in projective
+// Jacobian coordinates, R(rx, ry, rz) will be in projective Jacobian
+// coordinates too.
 //
-// This routine implements algorithm 3.22 from "Guide to Elliptic Curve
-// Cryptography". Differences from the original algorithm:
+// This routine implements the Point Addition algorithm from
+// https://en.wikibooks.org/wiki/Cryptography/Prime_Curve/Jacobian_Coordinates
 // 
-// 1) Step 1. is omitted, because point Q is always the base point, which is
-//    not at infinity by definition.
-//
-// 2) Step 9.1 just returns the pre-computed double of the base point instead
-// of actually doubling it.
-//
-// Note, that this routine also handles three special cases:
-//
-// 1) P is at infinity
-// 2) P == Q
-// 3) P == -Q
-//
-// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
-// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
-// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
-// then fpga_modular_mul(pz, &t1, &t1) instead.
+// Since the routine is meant to be used with Montgomery ladder, the invariant
+// R1 - R0 = G means, that the two special cases P == Q and P == -Q can never
+// happen and the checks are redundant. The checks for P == O and Q == O are
+// necessary, however. Note, that P and Q can't be at infinity at the same time
+// though.
 //
 // WARNING: This procedure does not take any active measures to keep run-time
 // constant. The main purpose of this model is to help debug Verilog code for
 // FPGA, so *DO NOT* use is anywhere near production!
 //
 //------------------------------------------------------------------------------
-void fpga_curve_add_jacobian_abstract(const FPGA_BUFFER *px,
-                                      const FPGA_BUFFER *py,
-                                      const FPGA_BUFFER *pz,
-                                            FPGA_BUFFER *rx,
-                                            FPGA_BUFFER *ry,
-                                            FPGA_BUFFER *rz)
+void fpga_curve_add_jacobian_abstract_2(const FPGA_BUFFER *px,
+                                        const FPGA_BUFFER *py,
+                                        const FPGA_BUFFER *pz,
+                                        const FPGA_BUFFER *qx,
+                                        const FPGA_BUFFER *qy,
+                                        const FPGA_BUFFER *qz,
+                                              FPGA_BUFFER *rx,
+                                              FPGA_BUFFER *ry,
+                                              FPGA_BUFFER *rz)
 //------------------------------------------------------------------------------
 {
-    FPGA_BUFFER t1, t2, t3, t4;     // temporary variables
+	bool pz_is_zero = fpga_multiword_is_zero(pz);
+	bool qz_is_zero = fpga_multiword_is_zero(qz);
+
+	FPGA_BUFFER t1, t2, t3, t4, t5, t6, t7, t8;
+
+#ifdef DUMP_UOP_OUTPUTS
+    _DUMP_MODULAR_RESULTS = true;
+#endif
+
+	fpga_modular_mul(pz, pz, &t1);		// pz2 = pz * pz (pz squared)
+	fpga_modular_mul(qz, qz, &t2);		// qz2 = qz * qz (qz squared)
+
+	fpga_modular_mul(pz, &t1, &t3);		// pz3 = pz * pz2 (pz cubed)
+	fpga_modular_mul(qz, &t2, &t4);     // qz3 = qz * qz2 (qz cubed)
+    
+	fpga_modular_mul(px, &t2, &t5);	    // pxz = px * qz2 (px z-adjusted)
+	fpga_modular_mul(qx, &t1, &t2);     // qxz = qx * pz2 (qx z-adjusted)
+
+	fpga_modular_mul(py, &t4, &t6);		// pyz = py * qz3 (py z-adjusted)
+	fpga_modular_mul(qy, &t3, &t4);		// qyz = qy * pz3 (qy z-adjusted)
+
+	fpga_modular_sub(&t2, &t5, &t7);	// dqpx = qxz - pxz (x-coordinate delta)
+	fpga_modular_sub(&t4, &t6, &t8);	// dqpy = qyz - pyz (y-coordinate delta) 
+ 
+	fpga_modular_mul(pz,  qz,  &t1);	// pqz = pz * qz
+	fpga_modular_mul(&t7, &t1, rz);		// rz = pqz * dqpx
+    
+	fpga_modular_mul(&t8, &t8, &t2);	// dqpy2 = dqpy * dqpy
+	fpga_modular_mul(&t7, &t7, &t3);	// dqpx2 = dqpx * dqpx
+	fpga_modular_mul(&t7, &t3, &t4);	// dqpx3 = dqpx * dqpx2
+    
+	fpga_modular_sub(&t2, &t4, &t1);	// t1 = dqpy2 - dqpx3
+	fpga_modular_mul(&t5, &t3, &t2);	// t2 = pxz * dqpx2
+	fpga_modular_add(&t2, &t2, &t3);	// t3 = 2 * t2 (= t2 + t2, which is faster)
+	fpga_modular_sub(&t1, &t3, rx);		// rx = t1 - t3
+    
+	fpga_modular_sub(&t2, rx,  &t1);	// t1 = t2 - rx
+    fpga_modular_mul(&t1, &t8, &t2);	// t2 = t1 * dqpy
+	fpga_modular_mul(&t6, &t4, &t3);	// t3 = pyz * dqpx3
+	fpga_modular_sub(&t2, &t3, ry);		// ry = t2 - t3
     
-    bool pz_is_zero = fpga_multiword_is_zero(pz);       // Step 2.
-
-    /*  3. */ fpga_multiword_copy(pz,  &t1);
-              fpga_modular_mul(pz,  &t1,        &t1);
-    /*  4. */ fpga_modular_mul(pz,  &t1,        &t2);
-    /*  5. */ fpga_modular_mul(&t1, &ECDSA_GX, &t1);
-    /*  6. */ fpga_modular_mul(&t2, &ECDSA_GY, &t2);
-    /*  7. */ fpga_modular_sub(&t1, px,         &t1);
-    /*  8. */ fpga_modular_sub(&t2, py,         &t2);
-
-    bool t1_is_zero = fpga_multiword_is_zero(&t1);      // | Step 9.
-    bool t2_is_zero = fpga_multiword_is_zero(&t2);      // |
-
-    /* 10. */ fpga_modular_mul(pz,  &t1,        rz);
-    /* 11. */ fpga_multiword_copy(&t1, &t3);
-              fpga_modular_mul(&t1, &t3,        &t3);
-    /* 12. */ fpga_modular_mul(&t1, &t3,        &t4);
-    /* 13. */ fpga_modular_mul(px,  &t3,        &t3);
-    /* 14. */ fpga_modular_add(&t3, &t3,        &t1);
-    /* 15. */ fpga_multiword_copy(&t2, rx);
-              fpga_modular_mul(rx,  &t2,        rx);
-    /* 16. */ fpga_modular_sub(rx,  &t1,        rx);
-    /* 17. */ fpga_modular_sub(rx,  &t4,        rx);
-    /* 18. */ fpga_modular_sub(&t3, rx,         &t3);
-    /* 19. */ fpga_modular_mul(&t2, &t3,        &t3);
-    /* 20. */ fpga_modular_mul(py,  &t4,        &t4);
-    /* 21. */ fpga_modular_sub(&t3, &t4,        ry);
-
-        //
-        // final selection
-        //
-    if (pz_is_zero) // P at infinity ?
-    {   fpga_multiword_copy(&ECDSA_GX, rx);
-        fpga_multiword_copy(&ECDSA_GY, ry);
-        fpga_multiword_copy(&ECDSA_ONE, rz);
+    _DUMP_MODULAR_RESULTS = false;
+
+	// P == O
+    if (pz_is_zero)
+    {   fpga_multiword_copy(qx, rx);
+        fpga_multiword_copy(qy, ry);
+        fpga_multiword_copy(qz, rz);
+		return;
     }
-    else if (t1_is_zero) // same x for P and Q ?
-    {
-        fpga_multiword_copy(t2_is_zero ? &ECDSA_HX : &ECDSA_ONE,  rx);  // | same y ? (P==Q => R=2*G) : (P==-Q => R=O)
-        fpga_multiword_copy(t2_is_zero ? &ECDSA_HY : &ECDSA_ONE,  ry);  // |
-        fpga_multiword_copy(t2_is_zero ? &ECDSA_ONE : &ECDSA_ZERO, rz); // |
+
+	// Q == O
+    if (qz_is_zero)
+    {   fpga_multiword_copy(px, rx);
+        fpga_multiword_copy(py, ry);
+        fpga_multiword_copy(pz, rz);
+		return;
     }
 }
 
diff --git a/ecdsa_fpga_curve_microcode.cpp b/ecdsa_fpga_curve_microcode.cpp
index 553498c..128e087 100644
--- a/ecdsa_fpga_curve_microcode.cpp
+++ b/ecdsa_fpga_curve_microcode.cpp
@@ -6,7 +6,7 @@
 //
 // Authors: Pavel Shatov
 //
-// Copyright (c) 2018 NORDUnet A/S
+// Copyright (c) 2018, 2021 NORDUnet A/S
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
@@ -51,205 +51,186 @@
 
 //------------------------------------------------------------------------------
 //
-// Doubles the point stored in CYCLE_R* and stores the result in CYCLE_S*.
+// Doubles the point stored in CYCLE_R0* and stores the result in CYCLE_T*.
 //
 //------------------------------------------------------------------------------
-void fpga_curve_double_jacobian_microcode()
+void fpga_curve_double_jacobian_microcode_r0()
 //------------------------------------------------------------------------------
 {
-    // fpga_modular_mul(RZ, RZ,    RZ2 ); //  2.  RZ2  = RZ * RZ
-    // fpga_modular_sub(RX, RZ2,   T1  ); //  3.  T1   = RX - RZ2
-    // fpga_modular_add(RX, RZ2,   T2  ); //  4.  T2   = RX + RZ2
-    // fpga_modular_mul(T1, T2,    T3  ); //  5.  T3   = T1 * T2
-    // fpga_modular_add(T3, T3,    T4  ); //  6a. T4   = T3 + T3
-    // fpga_modular_add(T3, T4,    A   ); //  6b. A    = T3 + T4
-    // fpga_modular_add(RY, RY,    B   ); //  7.  B    = RY + RY
-    // fpga_modular_mul(B,  RZ,    SZ  ); //  8.  SZ   = B  * RZ [output]
-    // fpga_modular_mul(B,  B,     C   ); //  9.  C    = B  * B
-    // fpga_modular_mul(C,  RX,    D   ); // 10.  D    = C  * RX
-    // fpga_modular_mul(C,  C,     C2  ); // 11.  C2   = C  * C
-    // fpga_modular_mul(C2, DELTA, C2_2); // 12.  C2_2 = C  / 2
-    // fpga_modular_mul(A,  A,     A2  ); // 13.  A2   = A  * A
-    // fpga_modular_add(D,  D,     T1  ); // 14.  T1   = D  + D
-    // fpga_modular_sub(A2, T1,    SX  ); // 15.  SX   = A2 - T1 [output]
-    // fpga_modular_sub(D,  SX,    T1  ); // 16.  T1   = D  - SX
-    // fpga_modular_mul(A , T1,    T2  ); // 17.  T2   = A  * T1
-    // fpga_modular_sub(T2, C2_2,  SY  ); // 18.  SY   = T2 - C2_2 [output]
+    /* BEGIN_MICROCODE: CYCLE_DOUBLE_R0 */
+
+    uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R0Z,   BANK_HI, CYCLE_T1);		
+    uop_calc(SUB, BANK_HI, CYCLE_R0X, CYCLE_T1,    BANK_LO, CYCLE_T2);
+	uop_calc(ADD, BANK_HI, CYCLE_R0X, CYCLE_T1,    BANK_LO, CYCLE_T3);
+	uop_calc(MUL, BANK_LO, CYCLE_T3,  CYCLE_T2,    BANK_HI, CYCLE_T4);
+	uop_calc(ADD, BANK_HI, CYCLE_T4,  CYCLE_T4,    BANK_LO, CYCLE_T1);
+	
+	uop_move(     BANK_HI, CYCLE_T4,               BANK_LO, CYCLE_T4);
+	
+	uop_calc(ADD, BANK_LO, CYCLE_T1,  CYCLE_T4,    BANK_HI, CYCLE_T2);
+	uop_calc(ADD, BANK_HI, CYCLE_R0Y, CYCLE_R0Y,   BANK_LO, CYCLE_TY);
+	uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_TY,    BANK_HI, CYCLE_TZ);
+	uop_calc(MUL, BANK_LO, CYCLE_TY,  CYCLE_TY,    BANK_HI, CYCLE_T1);
+	uop_calc(MUL, BANK_HI, CYCLE_R0X, CYCLE_T1,    BANK_LO, CYCLE_T3);
+	uop_calc(MUL, BANK_HI, CYCLE_T1,  CYCLE_T1,    BANK_LO, CYCLE_T4);
+	uop_calc(MUL, BANK_LO, CYCLE_T4,  CONST_DELTA, BANK_HI, CYCLE_T5);
+	uop_calc(MUL, BANK_HI, CYCLE_T2,  CYCLE_T2,    BANK_LO, CYCLE_T4);
+	uop_calc(ADD, BANK_LO, CYCLE_T3,  CYCLE_T3,    BANK_HI, CYCLE_T1);
+
+	uop_move(     BANK_LO, CYCLE_T4,               BANK_HI, CYCLE_T4);
+
+	uop_calc(SUB, BANK_HI, CYCLE_T4,  CYCLE_T1,    BANK_LO, CYCLE_TX);
+	uop_calc(SUB, BANK_LO, CYCLE_T3,  CYCLE_TX,    BANK_HI, CYCLE_T1);
+	uop_calc(MUL, BANK_HI, CYCLE_T1,  CYCLE_T2,    BANK_LO, CYCLE_T3);
+
+	uop_move(     BANK_LO, CYCLE_T3,               BANK_HI, CYCLE_T3);
+	
+	uop_calc(SUB, BANK_HI, CYCLE_T3,  CYCLE_T5,    BANK_LO, CYCLE_TY);
 
-    /* BEGIN_MICROCODE: CYCLE_DOUBLE */
-
-    FPGA_BUFFER TEMP;
-
-    uop_calc(MUL, BANK_LO, CYCLE_RZ, CYCLE_RZ,    BANK_HI,  CYCLE_Z2);
-    uop_stor(BANK_HI, CYCLE_Z2, &TEMP); print_fpga_buffer("CYCLE_Z2 = ", &TEMP);
-
-    uop_calc(SUB, BANK_HI, CYCLE_RX, CYCLE_Z2,    BANK_LO,  CYCLE_T1);
-    uop_stor(BANK_LO, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP);
-
-    uop_calc(ADD, BANK_HI, CYCLE_RX, CYCLE_Z2,    BANK_LO,  CYCLE_T2);
-    uop_stor(BANK_LO, CYCLE_T2, &TEMP); print_fpga_buffer("CYCLE_T2 = ", &TEMP);
+    /* END_MICROCODE */
+}
 
-    uop_calc(MUL, BANK_LO, CYCLE_T1, CYCLE_T2,    BANK_HI,  CYCLE_T3);
-    uop_stor(BANK_HI, CYCLE_T3, &TEMP); print_fpga_buffer("CYCLE_T3 = ", &TEMP);
 
-    uop_calc(ADD, BANK_HI, CYCLE_T3, CYCLE_T3,    BANK_LO,  CYCLE_T4);
-    uop_stor(BANK_LO, CYCLE_T4, &TEMP); print_fpga_buffer("CYCLE_T4 = ", &TEMP);
+//------------------------------------------------------------------------------
+//
+// Doubles the point stored in CYCLE_R1* and stores the result in CYCLE_T*.
+//
+//------------------------------------------------------------------------------
+void fpga_curve_double_jacobian_microcode_r1()
+//------------------------------------------------------------------------------
+{
+    /* BEGIN_MICROCODE: CYCLE_DOUBLE_R1 */
+
+	uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_R1Z,   BANK_HI, CYCLE_T1);		
+	uop_calc(SUB, BANK_HI, CYCLE_R1X, CYCLE_T1,    BANK_LO, CYCLE_T2);
+	uop_calc(ADD, BANK_HI, CYCLE_R1X, CYCLE_T1,    BANK_LO, CYCLE_T3);
+	uop_calc(MUL, BANK_LO, CYCLE_T3,  CYCLE_T2,    BANK_HI, CYCLE_T4);
+	uop_calc(ADD, BANK_HI, CYCLE_T4,  CYCLE_T4,    BANK_LO, CYCLE_T1);
+	
+	uop_move(     BANK_HI, CYCLE_T4,               BANK_LO, CYCLE_T4);
+	
+	uop_calc(ADD, BANK_LO, CYCLE_T1,  CYCLE_T4,    BANK_HI, CYCLE_T2);
+	uop_calc(ADD, BANK_HI, CYCLE_R1Y, CYCLE_R1Y,   BANK_LO, CYCLE_TY);
+	uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_TY,    BANK_HI, CYCLE_TZ);
+	uop_calc(MUL, BANK_LO, CYCLE_TY,  CYCLE_TY,    BANK_HI, CYCLE_T1);
+	uop_calc(MUL, BANK_HI, CYCLE_R1X, CYCLE_T1,    BANK_LO, CYCLE_T3);
+	uop_calc(MUL, BANK_HI, CYCLE_T1,  CYCLE_T1,    BANK_LO, CYCLE_T4);
+	uop_calc(MUL, BANK_LO, CYCLE_T4,  CONST_DELTA, BANK_HI, CYCLE_T5);
+	uop_calc(MUL, BANK_HI, CYCLE_T2,  CYCLE_T2,    BANK_LO, CYCLE_T4);
+	uop_calc(ADD, BANK_LO, CYCLE_T3,  CYCLE_T3,    BANK_HI, CYCLE_T1);
+
+	uop_move(     BANK_LO, CYCLE_T4,               BANK_HI, CYCLE_T4);
+
+	uop_calc(SUB, BANK_HI, CYCLE_T4,  CYCLE_T1,    BANK_LO, CYCLE_TX);
+	uop_calc(SUB, BANK_LO, CYCLE_T3,  CYCLE_TX,    BANK_HI, CYCLE_T1);
+	uop_calc(MUL, BANK_HI, CYCLE_T1,  CYCLE_T2,    BANK_LO, CYCLE_T3);
+
+	uop_move(     BANK_LO, CYCLE_T3,               BANK_HI, CYCLE_T3);
+	
+	uop_calc(SUB, BANK_HI, CYCLE_T3,  CYCLE_T5,    BANK_LO, CYCLE_TY);
 
-    uop_move(     BANK_LO, CYCLE_T4, BANK_HI,     CYCLE_T4);
+    /* END_MICROCODE */
+}
 
-    uop_calc(ADD, BANK_HI, CYCLE_T3, CYCLE_T4,    BANK_LO,  CYCLE_A);
-    uop_stor(BANK_LO, CYCLE_A, &TEMP); print_fpga_buffer("CYCLE_A = ", &TEMP);
 
-    uop_calc(ADD, BANK_HI, CYCLE_RY, CYCLE_RY,    BANK_LO,  CYCLE_B);
-    uop_stor(BANK_LO, CYCLE_B, &TEMP); print_fpga_buffer("CYCLE_B = ", &TEMP);
+//------------------------------------------------------------------------------
+//
+// Adds the points stored in CYCLE_R0|1 and stores the result in CYCLE_S.
+//
+//------------------------------------------------------------------------------
+void fpga_curve_add_jacobian_microcode_2()
+{
 
-    uop_calc(MUL, BANK_LO, CYCLE_B,  CYCLE_RZ,    BANK_HI,  CYCLE_SZ);
-    uop_stor(BANK_HI, CYCLE_SZ, &TEMP); print_fpga_buffer("CYCLE_SZ = ", &TEMP);
+    /* BEGIN_MICROCODE: CYCLE_ADD */
 
-    uop_calc(MUL, BANK_LO, CYCLE_B,  CYCLE_B,     BANK_HI,  CYCLE_C);
-    uop_stor(BANK_HI, CYCLE_C, &TEMP); print_fpga_buffer("CYCLE_C = ", &TEMP);
+	uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R0Z, BANK_HI, CYCLE_T1);
+	uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_R1Z, BANK_HI, CYCLE_T2);
+	// T1 = R0Z^2 and T2 = R1Z^2 are read from both banks below, so mirror them into LO
+	uop_move(     BANK_HI, CYCLE_T1,             BANK_LO, CYCLE_T1);
+	uop_move(     BANK_HI, CYCLE_T2,             BANK_LO, CYCLE_T2);
 
-    uop_calc(MUL, BANK_HI, CYCLE_C,  CYCLE_RX,    BANK_LO,  CYCLE_D);
-    uop_stor(BANK_LO, CYCLE_D, &TEMP); print_fpga_buffer("CYCLE_D = ", &TEMP);
+	uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_T1,  BANK_HI, CYCLE_T3);
+	uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_T2,  BANK_HI, CYCLE_T4);
 
-    uop_calc(MUL, BANK_HI, CYCLE_C,  CYCLE_C,     BANK_LO,  CYCLE_C2);
-    uop_stor(BANK_LO, CYCLE_C2, &TEMP); print_fpga_buffer("CYCLE_C2 = ", &TEMP);
+	uop_calc(MUL, BANK_HI, CYCLE_R0X, CYCLE_T2,  BANK_LO, CYCLE_T5);
+	uop_calc(MUL, BANK_HI, CYCLE_R1X, CYCLE_T1,  BANK_LO, CYCLE_T2);
 
-    uop_calc(MUL, BANK_LO, CYCLE_C2, CONST_DELTA, BANK_HI,  CYCLE_C2_2);
-    uop_stor(BANK_HI, CYCLE_C2_2, &TEMP); print_fpga_buffer("CYCLE_C2_2 = ", &TEMP);
+	uop_calc(MUL, BANK_HI, CYCLE_R0Y, CYCLE_T4,  BANK_LO, CYCLE_T6);
+	uop_calc(MUL, BANK_HI, CYCLE_R1Y, CYCLE_T3,  BANK_LO, CYCLE_T4);
 
-    uop_calc(MUL, BANK_LO, CYCLE_A,  CYCLE_A,     BANK_HI,  CYCLE_A2);
-    uop_stor(BANK_HI, CYCLE_A2, &TEMP); print_fpga_buffer("CYCLE_A2 = ", &TEMP);
+	uop_calc(SUB, BANK_LO, CYCLE_T2,  CYCLE_T5,  BANK_HI, CYCLE_T7);
+	uop_calc(SUB, BANK_LO, CYCLE_T4,  CYCLE_T6,  BANK_HI, CYCLE_T8);
 
-    uop_calc(ADD, BANK_LO, CYCLE_D,  CYCLE_D,     BANK_HI,  CYCLE_T1);
-    uop_stor(BANK_HI, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP);
+	uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R1Z, BANK_HI, CYCLE_T1);
 
-    uop_calc(SUB, BANK_HI, CYCLE_A2, CYCLE_T1,    BANK_LO,  CYCLE_SX);
-    uop_stor(BANK_LO, CYCLE_SX, &TEMP); print_fpga_buffer("CYCLE_SX = ", &TEMP);
+	uop_move(     BANK_HI, CYCLE_T1,             BANK_LO, CYCLE_T1);
+	uop_move(     BANK_HI, CYCLE_T7,             BANK_LO, CYCLE_T7);
 
-    uop_calc(SUB, BANK_LO, CYCLE_D,  CYCLE_SX,    BANK_HI,  CYCLE_T1);
-    uop_stor(BANK_HI, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP);
+	uop_calc(MUL, BANK_LO, CYCLE_T7,  CYCLE_T1,  BANK_HI, CYCLE_SZ);
 
-    uop_move(     BANK_HI, CYCLE_T1, BANK_LO,     CYCLE_T1);
+	uop_calc(MUL, BANK_HI, CYCLE_T8,  CYCLE_T8,  BANK_LO, CYCLE_T2);
+	uop_calc(MUL, BANK_LO, CYCLE_T7,  CYCLE_T7,  BANK_HI, CYCLE_T3);
+	uop_calc(MUL, BANK_HI, CYCLE_T7,  CYCLE_T3,  BANK_LO, CYCLE_T4);
 
-    uop_calc(MUL, BANK_LO, CYCLE_A,  CYCLE_T1,    BANK_HI,  CYCLE_T2);
-    uop_stor(BANK_HI, CYCLE_T2, &TEMP); print_fpga_buffer("CYCLE_T2 = ", &TEMP);
+	uop_calc(SUB, BANK_LO, CYCLE_T2,  CYCLE_T4,  BANK_HI, CYCLE_T1);
 
-    uop_calc(SUB, BANK_HI, CYCLE_T2, CYCLE_C2_2,  BANK_LO,  CYCLE_SY);
-    uop_stor(BANK_LO, CYCLE_SY, &TEMP); print_fpga_buffer("CYCLE_SY = ", &TEMP);
+	uop_move(     BANK_LO, CYCLE_T5,             BANK_HI, CYCLE_T5);
 
-    /* END_MICROCODE */
-}
+	uop_calc(MUL, BANK_HI, CYCLE_T5,  CYCLE_T3,  BANK_LO, CYCLE_T2);
+	uop_calc(ADD, BANK_LO, CYCLE_T2,  CYCLE_T2,  BANK_HI, CYCLE_T3);
+	uop_calc(SUB, BANK_HI, CYCLE_T1,  CYCLE_T3,  BANK_LO, CYCLE_SX);
 
+	uop_calc(SUB, BANK_LO, CYCLE_T2,  CYCLE_SX,  BANK_HI, CYCLE_T1);
 
-//------------------------------------------------------------------------------
-//
-// Adds the base point G to the point stored in CYCLE_S* and stores the result
-// again in CYCLE_R*.
-//
-//------------------------------------------------------------------------------
-void fpga_curve_add_jacobian_microcode()
-{
-    //fpga_modular_mul(SZ, SZ,        A) ; //  3. A  = SZ * SZ
-    //fpga_modular_mul(A,  SZ,        B ); //  4. B  = A  * SZ
-    //fpga_modular_mul(A,  &ECDSA_GX, C ); //  5. C  = A  * GX
-    //fpga_modular_mul(B,  &ECDSA_GY, D ); //  6. D  = B  * GY
-    //fpga_modular_sub(C,  SX,        E ); //  7. E  = C  - SX
-    //fpga_modular_sub(D,  SY,        F ); //  8. F  = D  - SY
-    //fpga_modular_mul(E,  SZ,        RZ); // 10. RZ = E  * SZ [output]
-    //fpga_modular_mul(E,  E,         G ); // 11. G  = E  * E
-    //fpga_modular_mul(E,  G,         H ); // 12. H  = E  * G
-    //fpga_modular_mul(G,  SX,        J ); // 13. J  = G  * SX
-    //fpga_modular_add(J,  J,         T1); // 14. T1 = J  + J
-    //fpga_modular_mul(F,  F,         T2); // 15. T2 = F  * F
-    //fpga_modular_sub(T2, T1,        T3); // 16. T3 = T2 - T1
-    //fpga_modular_sub(T3, H,         RX); // 17. RX = T3 - H [output]
-    //fpga_modular_sub(J,  RX,        T1); // 18. T1 = J  - RX
-    //fpga_modular_mul(F,  T1,        T2); // 19. T2 = F  * T1
-    //fpga_modular_mul(H,  SY,        T3); // 20. T3 = H  * SY
-    //fpga_modular_sub(T2, T3,        RY); // 21. RY = T2 - T3 [output]
+	uop_move(     BANK_HI, CYCLE_T8,             BANK_LO, CYCLE_T8);
+	uop_move(     BANK_HI, CYCLE_T1,             BANK_LO, CYCLE_T1);
 
-    /* BEGIN_MICROCODE: CYCLE_ADD */
+	uop_calc(MUL, BANK_LO, CYCLE_T1,  CYCLE_T8,  BANK_HI, CYCLE_T2);
+	uop_calc(MUL, BANK_LO, CYCLE_T6,  CYCLE_T4,  BANK_HI, CYCLE_T3);
+	uop_calc(SUB, BANK_HI, CYCLE_T2,  CYCLE_T3,  BANK_LO, CYCLE_SY);
 
-    uop_cmpz(     BANK_HI, CYCLE_SZ);
-    uop_move(     BANK_HI, CYCLE_SZ, BANK_LO,  CYCLE_SZ);
-    uop_calc(MUL, BANK_LO, CYCLE_SZ, CYCLE_SZ, BANK_HI,  CYCLE_A);
-    uop_calc(MUL, BANK_HI, CYCLE_A,  CYCLE_SZ, BANK_LO,  CYCLE_B);
-    uop_move(     BANK_LO, CYCLE_B,  BANK_HI,  CYCLE_B);
-    uop_calc(MUL, BANK_HI, CYCLE_A,  CONST_GX, BANK_LO,  CYCLE_C);
-    uop_calc(MUL, BANK_HI, CYCLE_B,  CONST_GY, BANK_LO,  CYCLE_D);
-    uop_calc(SUB, BANK_LO, CYCLE_C,  CYCLE_SX, BANK_HI,  CYCLE_E);
-    uop_calc(SUB, BANK_LO, CYCLE_D,  CYCLE_SY, BANK_HI,  CYCLE_F);
-    uop_cmpz(     BANK_HI, CYCLE_E);
-    uop_cmpz(     BANK_HI, CYCLE_F);
-    uop_calc(MUL, BANK_HI, CYCLE_E,  CYCLE_SZ, BANK_LO,  CYCLE_RZ);
-    uop_calc(MUL, BANK_HI, CYCLE_E,  CYCLE_E,  BANK_LO,  CYCLE_G);
-    uop_move(     BANK_LO, CYCLE_G,  BANK_HI,  CYCLE_G);
-    uop_calc(MUL, BANK_HI, CYCLE_E,  CYCLE_G,  BANK_LO,  CYCLE_H);
-    uop_calc(MUL, BANK_LO, CYCLE_G,  CYCLE_SX, BANK_HI,  CYCLE_J);
-    uop_calc(ADD, BANK_HI, CYCLE_J,  CYCLE_J,  BANK_LO,  CYCLE_T1);
-    uop_calc(MUL, BANK_HI, CYCLE_F,  CYCLE_F,  BANK_LO,  CYCLE_T2);
-    uop_calc(SUB, BANK_LO, CYCLE_T2, CYCLE_T1, BANK_HI,  CYCLE_T3);
-    uop_move(     BANK_HI, CYCLE_T3, BANK_LO,  CYCLE_T3);
-    uop_calc(SUB, BANK_LO, CYCLE_T3, CYCLE_H,  BANK_HI,  CYCLE_RX);
-    uop_calc(SUB, BANK_HI, CYCLE_J,  CYCLE_RX, BANK_LO,  CYCLE_T1);
-    uop_move(     BANK_HI, CYCLE_F,  BANK_LO,  CYCLE_F);
-    uop_calc(MUL, BANK_LO, CYCLE_F,  CYCLE_T1, BANK_HI,  CYCLE_T2);
-    uop_calc(MUL, BANK_LO, CYCLE_H,  CYCLE_SY, BANK_HI,  CYCLE_T3);
-    uop_calc(SUB, BANK_HI, CYCLE_T2, CYCLE_T3, BANK_LO,  CYCLE_RY);
-    uop_move(     BANK_LO, CYCLE_RY, BANK_HI,  CYCLE_RY);
+	uop_cmpz(BANK_LO, CYCLE_R0Z);
+	uop_cmpz(BANK_LO, CYCLE_R1Z);
 
     /* END_MICROCODE */
 
     //
     // handle special corner cases
     //
-    if (uop_flagz_sz)
+    // R0Z is zero but R1Z is not (flags presumably set by the uop_cmpz calls above): S = R1
+    if (uop_flagz_r0z && !uop_flagz_r1z)
     {
-        /* BEGIN_MICROCODE: CYCLE_ADD_AT_INFINITY */
+        /* BEGIN_MICROCODE: CYCLE_ADD_R0_AT_INFINITY */
 
-        uop_move(BANK_LO, CONST_GX,  BANK_HI, CYCLE_RX);
-        uop_move(BANK_LO, CONST_GY,  BANK_HI, CYCLE_RY);
-        uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_RZ);
+        uop_move(BANK_HI, CYCLE_R1X, BANK_LO, CYCLE_SX);
+        uop_move(BANK_HI, CYCLE_R1Y, BANK_LO, CYCLE_SY);
+        uop_move(BANK_LO, CYCLE_R1Z, BANK_HI, CYCLE_SZ);
 
         /* END_MICROCODE */
+        // S is final here; returning skips the CYCLE_ADD_REGULAR moves below
+		return;
     }
-    else
+    // R1Z is zero but R0Z is not: S = R0
+    if (!uop_flagz_r0z && uop_flagz_r1z)
     {
-        if (uop_flagz_e)
-        {
-            if (uop_flagz_f)
-            {
-                /* BEGIN_MICROCODE: CYCLE_ADD_SAME_X_SAME_Y */
+        /* BEGIN_MICROCODE: CYCLE_ADD_R1_AT_INFINITY */
 
-                uop_move(BANK_LO, CONST_HX,  BANK_HI, CYCLE_RX);
-                uop_move(BANK_LO, CONST_HY,  BANK_HI, CYCLE_RY);
-                uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_RZ);
+        uop_move(BANK_HI, CYCLE_R0X, BANK_LO, CYCLE_SX);
+        uop_move(BANK_HI, CYCLE_R0Y, BANK_LO, CYCLE_SY);
+        uop_move(BANK_LO, CYCLE_R0Z, BANK_HI, CYCLE_SZ);
 
-                /* END_MICROCODE */
-            }
-            else
-            {
-                /* BEGIN_MICROCODE: CYCLE_ADD_SAME_X */
+        /* END_MICROCODE */
 
-                uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_RX);
-                uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_RY);
-                uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_RZ);
+		return;
+    }
 
-                /* END_MICROCODE */
-            }
-        }
-        else
-        {
-            /* BEGIN_MICROCODE: CYCLE_ADD_REGULAR */
+	/* BEGIN_MICROCODE: CYCLE_ADD_REGULAR */
 
-            uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_T1);
-            uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_T2);
-            uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_T3);
+    uop_move(BANK_LO, CONST_GX,  BANK_HI, CYCLE_SX);
+    uop_move(BANK_LO, CONST_GY,  BANK_HI, CYCLE_SY);
+	uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_SZ);
 
-            /* END_MICROCODE */
-        }
-    }
+	/* END_MICROCODE */
 }
 
 
@@ -262,6 +243,13 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER
 	FPGA_WORD k_word;
 	bool k_bit;
 
+#ifdef DUMP_CYCLE_STATES
+    FPGA_BUFFER r0x, r0y, r0z;
+    FPGA_BUFFER r1x, r1y, r1z;
+    FPGA_BUFFER sx,  sy,  sz;
+    FPGA_BUFFER tx,  ty,  tz;
+#endif
+
     // initialize internal banks
     fpga_multiword_copy(&ECDSA_ZERO,  &BUF_LO[CONST_ZERO]);
     fpga_multiword_copy(&ECDSA_ZERO,  &BUF_HI[CONST_ZERO]);
@@ -278,61 +266,110 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER
     fpga_multiword_copy(&ECDSA_GY, &BUF_LO[CONST_GY]);
     fpga_multiword_copy(&ECDSA_GY, &BUF_HI[CONST_GY]);
 
-    fpga_multiword_copy(&ECDSA_HX, &BUF_LO[CONST_HX]);
-    fpga_multiword_copy(&ECDSA_HX, &BUF_HI[CONST_HX]);
-
-    fpga_multiword_copy(&ECDSA_HY, &BUF_LO[CONST_HY]);
-    fpga_multiword_copy(&ECDSA_HY, &BUF_HI[CONST_HY]);
-
     /* BEGIN_MICROCODE: PREPARE */
     
-    // set initial value of R to point at infinity
-	uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_RX);
-	uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_RY);
-	uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_RZ);
+    // set initial value of R0 to point at infinity
+    // set initial value of R1 to the base point
+	
+    uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_R0X);
+	uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_R0Y);
+	uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_R0Z);
+
+    uop_move(BANK_LO, CONST_GX,  BANK_HI, CYCLE_R1X);
+	uop_move(BANK_LO, CONST_GY,  BANK_HI, CYCLE_R1Y);
+	uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_R1Z);
 
     /* END_MICROCODE */
 
+
         /* process bits of k left-to-right */
     for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--)
         for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
         {
 			k_word = k->words[word_count-1];
-			k_bit = (k_word & (FPGA_WORD)(1 << (bit_count-1))) > 0;            
+			k_bit = (k_word & (FPGA_WORD)(1 << (bit_count-1))) > 0;
 
-            // Banks of working cycle operands
-            // -------------------------------
-            // RX: HI
-            // RY: HI
-            // RZ: LO
+#ifdef DUMP_CYCLE_STATES
+            dump_cycle_header(word_count, bit_count, k_bit);
+#endif
 
-            // calculate S = 2 * R
-            fpga_curve_double_jacobian_microcode();
+            //
+			// calculate S = R0 + R1
+			//
 
             // Banks of working cycle operands
             // -------------------------------
+            // R0|1X: HI
+            // R0|1Y: HI
+            // R0|1Z: LO
+
             // SX: LO
             // SY: LO
             // SZ: HI
 
-            // always calculate R = S * G for constant-time operation
-            fpga_curve_add_jacobian_microcode();
+            fpga_curve_add_jacobian_microcode_2();
+
+			//
+            // calculate T = 2 * R0 or T = 2 * R1
+			//
 
             // Banks of working cycle operands
             // -------------------------------
-            // RX: HI
-            // RY: HI
-            // RZ: LO
+            // R0|1X: HI
+            // R0|1Y: HI
+            // R0|1Z: LO
+
+            // TX: LO
+            // TY: LO
+            // TZ: HI
 
+            if (!k_bit)
+                fpga_curve_double_jacobian_microcode_r0();
+            else
+                fpga_curve_double_jacobian_microcode_r1();
+
+            //
+            // dump cycle state
+            //
+#ifdef DUMP_CYCLE_STATES
             
+            uop_stor(BANK_HI, CYCLE_R0X, &r0x);
+            uop_stor(BANK_HI, CYCLE_R0Y, &r0y);
+            uop_stor(BANK_LO, CYCLE_R0Z, &r0z);
+
+            uop_stor(BANK_HI, CYCLE_R1X, &r1x);
+            uop_stor(BANK_HI, CYCLE_R1Y, &r1y);
+            uop_stor(BANK_LO, CYCLE_R1Z, &r1z);
+
+            uop_stor(BANK_LO, CYCLE_SX, &sx);
+            uop_stor(BANK_LO, CYCLE_SY, &sy);
+            uop_stor(BANK_HI, CYCLE_SZ, &sz);
+
+            uop_stor(BANK_LO, CYCLE_TX, &tx);
+            uop_stor(BANK_LO, CYCLE_TY, &ty);
+            uop_stor(BANK_HI, CYCLE_TZ, &tz);
+
+            dump_cycle_state(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z,
+                             &sx,  &sy,  &sz,  &tx,  &ty,  &tz);
+#endif
+
+            //
+            // update working variables
+            //
 			if (!k_bit)
             {
                 /* BEGIN_MICROCODE: CYCLE_K0 */
 
-                // revert to the value of S before addition if the current bit of k is not set
-                uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_RX);
-                uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_RY);
-                uop_move(BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_RZ);
+                // R0 = 2 * R0 (double)
+                // R1 = R0 + R1 (add)
+
+                uop_move(BANK_LO, CYCLE_TX, BANK_HI, CYCLE_R0X);
+                uop_move(BANK_LO, CYCLE_TY, BANK_HI, CYCLE_R0Y);
+                uop_move(BANK_HI, CYCLE_TZ, BANK_LO, CYCLE_R0Z);
+
+				uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_R1X);
+                uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_R1Y);
+                uop_move(BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_R1Z);
 
                 /* END_MICROCODE */
             }
@@ -340,74 +377,20 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER
             {
                 /* BEGIN_MICROCODE: CYCLE_K1 */
 
-                // do dummy overwrite for constant-time operation
-                uop_move(BANK_HI, CYCLE_RX, BANK_LO, CYCLE_SX);
-                uop_move(BANK_HI, CYCLE_RY, BANK_LO, CYCLE_SY);
-                uop_move(BANK_LO, CYCLE_RZ, BANK_HI, CYCLE_SZ);
+                // R0 = R0 + R1 (add)
+                // R1 = 2 * R1 (double)
+
+                uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_R0X);
+                uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_R0Y);
+                uop_move(BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_R0Z);
+
+				uop_move(BANK_LO, CYCLE_TX, BANK_HI, CYCLE_R1X);
+                uop_move(BANK_LO, CYCLE_TY, BANK_HI, CYCLE_R1Y);
+                uop_move(BANK_HI, CYCLE_TZ, BANK_LO, CYCLE_R1Z);
 
                 /* END_MICROCODE */
             }
 
-            FPGA_BUFFER TEMP;
-
-            //printf("wc = %d, bc = %d\n", word_count-1, bit_count-1);
-
-            uop_stor(BANK_LO, CYCLE_RX,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RX   = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_RY,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RY   = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_RZ,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RZ   = ", &TEMP);
-
-            uop_stor(BANK_LO, CYCLE_SX,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SX   = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_SY,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SY   = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_SZ,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SZ   = ", &TEMP);
-
-            uop_stor(BANK_LO, CYCLE_A,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_A    = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_A2,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_A2   = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_B,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_B    = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_C,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C    = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_C2,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C2   = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_C2_2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C2_2 = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_D,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_D    = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_E,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_E    = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_F,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_F    = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_G,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_G    = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_H,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_H    = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_J,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_J    = ", &TEMP);
-
-            uop_stor(BANK_LO, CYCLE_Z2,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_Z2   = ", &TEMP);
-
-            uop_stor(BANK_LO, CYCLE_T1,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T1   = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_T2,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T2   = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_T3,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T3   = ", &TEMP);
-            uop_stor(BANK_LO, CYCLE_T4,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T4   = ", &TEMP);
-
-            uop_stor(BANK_HI, CYCLE_RX,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RX   = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_RY,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RY   = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_RZ,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RZ   = ", &TEMP);
-
-            uop_stor(BANK_HI, CYCLE_SX,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SX   = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_SY,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SY   = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_SZ,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SZ   = ", &TEMP);
-
-            uop_stor(BANK_HI, CYCLE_A,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_A    = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_A2,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_A2   = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_B,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_B    = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_C,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C    = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_C2,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C2   = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_C2_2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C2_2 = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_D,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_D    = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_E,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_E    = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_F,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_F    = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_G,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_G    = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_H,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_H    = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_J,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_J    = ", &TEMP);
-
-            uop_stor(BANK_HI, CYCLE_Z2,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_Z2   = ", &TEMP);
-
-            uop_stor(BANK_HI, CYCLE_T1,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T1   = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_T2,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T2   = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_T3,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T3   = ", &TEMP);
-            uop_stor(BANK_HI, CYCLE_T4,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T4   = ", &TEMP);
-
         }
 
     // now convert to affine coordinates
@@ -415,18 +398,18 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER
 
     /* BEGIN_MICROCODE: CONVERT */
 
-    uop_calc(MUL, BANK_HI, INVERT_A2, CYCLE_RX, BANK_LO, CYCLE_SX);
-    uop_calc(MUL, BANK_HI, INVERT_A3, CYCLE_RY, BANK_LO, CYCLE_SY);
-    uop_cmpz(BANK_LO, CYCLE_RZ);
+    uop_calc(MUL, BANK_HI, INVERT_A2, CYCLE_R0X, BANK_LO, CYCLE_SX);
+    uop_calc(MUL, BANK_HI, INVERT_A3, CYCLE_R0Y, BANK_LO, CYCLE_SY);
+    uop_cmpz(BANK_LO, CYCLE_R0Z);
 
     /* END_MICROCODE */
 
-    if (uop_flagz_rz)
+    if (uop_flagz_r0z)
     {   
         /* BEGIN_MICROCODE: CONVERT_AT_INFINITY */
 
-        uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_RX);
-        uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_RY);
+        uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_R0X);
+        uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_R0Y);
 
         /* END_MICROCODE */
     }
@@ -434,15 +417,15 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER
 	{
         /* BEGIN_MICROCODE: CONVERT_REGULAR */
 
-        uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_RX);
-        uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_RY);
+        uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_R0X);
+        uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_R0Y);
 
         /* END_MICROCODE */
 	}
 
     // return
-    uop_stor(BANK_HI, CYCLE_RX, qx);
-    uop_stor(BANK_HI, CYCLE_RY, qy);
+    uop_stor(BANK_HI, CYCLE_R0X, qx);
+    uop_stor(BANK_HI, CYCLE_R0Y, qy);
 }
 #endif USE_MICROCODE
 
@@ -456,36 +439,48 @@ void fpga_curve_double_jacobian_microcode_wrapper(const FPGA_BUFFER *rx,
                                                         FPGA_BUFFER *sz)
 //------------------------------------------------------------------------------
 {
-    uop_load(rx, BANK_HI, CYCLE_RX);
-    uop_load(ry, BANK_HI, CYCLE_RY);
-    uop_load(rz, BANK_LO, CYCLE_RZ);
+    //
+    // we have two pieces of microcode to double either R0 or R1 (this
+    // depends on the current multiplier bit), here we can just always
+    // use the one meant for R0
 
-    fpga_curve_double_jacobian_microcode();
+    uop_load(rx, BANK_HI, CYCLE_R0X);
+    uop_load(ry, BANK_HI, CYCLE_R0Y);
+    uop_load(rz, BANK_LO, CYCLE_R0Z);
 
-    uop_stor(BANK_LO, CYCLE_SX, sx);
-    uop_stor(BANK_LO, CYCLE_SY, sy);
-    uop_stor(BANK_HI, CYCLE_SZ, sz);
+    fpga_curve_double_jacobian_microcode_r0();
+
+    uop_stor(BANK_LO, CYCLE_TX, sx);
+    uop_stor(BANK_LO, CYCLE_TY, sy);
+    uop_stor(BANK_HI, CYCLE_TZ, sz);
 }
 
 
 //------------------------------------------------------------------------------
-void fpga_curve_add_jacobian_microcode_wrapper(const FPGA_BUFFER *sx,
-                                               const FPGA_BUFFER *sy,
-                                               const FPGA_BUFFER *sz,
-                                                     FPGA_BUFFER *rx,
-                                                     FPGA_BUFFER *ry,
-                                                     FPGA_BUFFER *rz)
+void fpga_curve_add_jacobian_microcode_2_wrapper(const FPGA_BUFFER *px,
+                                                 const FPGA_BUFFER *py,
+                                                 const FPGA_BUFFER *pz,
+                                                 const FPGA_BUFFER *qx,
+                                                 const FPGA_BUFFER *qy,
+                                                 const FPGA_BUFFER *qz,
+                                                       FPGA_BUFFER *rx,
+                                                       FPGA_BUFFER *ry,
+                                                       FPGA_BUFFER *rz)
 //------------------------------------------------------------------------------
 {
-    uop_load(sx, BANK_LO, CYCLE_SX);
-    uop_load(sy, BANK_LO, CYCLE_SY);
-    uop_load(sz, BANK_HI, CYCLE_SZ);
+    uop_load(px, BANK_HI, CYCLE_R0X);
+    uop_load(py, BANK_HI, CYCLE_R0Y);
+    uop_load(pz, BANK_LO, CYCLE_R0Z);
+    // P goes into the R0 slots above, Q into the R1 slots below (X/Y in HI, Z in LO)
+    uop_load(qx, BANK_HI, CYCLE_R1X);
+    uop_load(qy, BANK_HI, CYCLE_R1Y);
+    uop_load(qz, BANK_LO, CYCLE_R1Z);
 
-    fpga_curve_add_jacobian_microcode();
+    fpga_curve_add_jacobian_microcode_2();
 
-    uop_stor(BANK_HI, CYCLE_RX, rx);
-    uop_stor(BANK_HI, CYCLE_RY, ry);
-    uop_stor(BANK_LO, CYCLE_RZ, rz);
+    uop_stor(BANK_LO, CYCLE_SX, rx);   // the add microcode leaves the sum in LO:SX,
+    uop_stor(BANK_LO, CYCLE_SY, ry);   // LO:SY and HI:SZ; the opposite banks receive
+    uop_stor(BANK_HI, CYCLE_SZ, rz);   // G from the CYCLE_ADD_REGULAR moves, not P + Q
 }
 
 



More information about the Commits mailing list