From git at cryptech.is  Mon Aug 19 11:07:02 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:02 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] branch master updated
 (711ffbd -> 0beee22)
Message-ID: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a change to branch master
in repository user/shatov/modexpng.

    from 711ffbd  Simplified index calculation and accumulator clearing logic. Better debug printout of accumulators.
     new b5a8b52   * more debugging output  * more precise modelling of DSP slice
     new 766bb93  Rewrote "square" recombination to match how it works in hardware.
     new a105c87  Same changes for "triangle" multiplication phase as for the "square" one (debugging output, simpler MAC clearing and index rotation logic).
     new ac6bc69  Cosmetic fixes.
     new aaf45e2  Removed some boilerplate code, all the three multiplication flavours are now working consistently. Still need to rework recombination routines.
     new e79b4bb  Fixed 4096-bit test vector generation.
     new 345be75  Intermediate version to fix recombination overflow bug.
     new c165ddc  * Added more debugging options:  - intentionally trigger internal overflow handler  - dump MAC inputs  - dump intermediate numbers during the reduction phase
     new 66be583  * Started conversion of the model to use micro-operations
     new a5200cd  * Added more micro-operations
     new b0fb263  * MASSIVE CLEANUP
     new 0beee22  * More cleanup (got rid of .wide. and .narrow.)

The 12 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 modexpng_fpga_model.py  | 1496 ++++++++++++++++++++++++++++++++++++++---------
 vector/vector_format.py |    4 +
 2 files changed, 1238 insertions(+), 262 deletions(-)


From git at cryptech.is  Mon Aug 19 11:07:03 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:03 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 01/12: * more debugging
 output * more precise modelling of DSP slice
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110703.85B46992A02@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit b5a8b522c917633e0a0db034c1135453d40d8105
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Sat Mar 30 15:29:56 2019 +0300

     * more debugging output
     * more precise modelling of DSP slice
---
 modexpng_fpga_model.py | 56 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 18 deletions(-)
diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index 844cc86..c73532f 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -77,7 +77,9 @@ _VECTOR_CLASS = "Vector"
 DUMP_VECTORS = False
 DUMP_INDICES = False
 DUMP_MACS_CLEARING = False
-DUMP_MACS_ACCUMULATION = True
+DUMP_MACS_ACCUMULATION = False
+DUMP_MULT_PARTS = False
+DUMP_RCMB = True
 
 
 #
@@ -204,13 +206,15 @@ class ModExpNG_PartRecombinator():
             y |= (x & (1 << pos)) >> lsb
         return y
 
-    def _flush_pipeline(self):
+    def _flush_pipeline(self, dump):
         self.z0, self.y0, self.x0 = 0, 0, 0
+        if dump and DUMP_RCMB:
+            print("RCMB -> flush()")
 
-    def _push_pipeline(self, part):
+    def _push_pipeline(self, part, dump):
 
         # split next part into 16-bit words
-        z = self._bit_select(part, 47, 32)
+        z = self._bit_select(part, 46, 32)
         y = self._bit_select(part, 31, 16)
         x = self._bit_select(part, 15,  0)
 
@@ -225,16 +229,20 @@ class ModExpNG_PartRecombinator():
         # update internal latches
         self.z0, self.y0, self.x0 = z1, y1, x1
 
+        # dump
+        if dump and DUMP_RCMB:
+            print("RCMB -> push(): part = 0x%012x, word = 0x%04x" % (part, t))
+        
         # done
         return t
 
-    def recombine_square(self, parts, ab_num_words):
+    def recombine_square(self, parts, ab_num_words, dump):
 
         # empty result so far
         words = list()
 
         # flush recombinator pipeline
-        self._flush_pipeline()
+        self._flush_pipeline(dump)
 
         # the first tick produces null result, the last part produces
         # two words, so we need (2*n - 1) + 2 = 2*n + 1 ticks total
@@ -242,40 +250,40 @@ class ModExpNG_PartRecombinator():
         for i in range(2 * ab_num_words + 1):
 
             next_part = parts[i] if i < (2 * ab_num_words - 1) else 0
-            next_word = self._push_pipeline(next_part)
+            next_word = self._push_pipeline(next_part, dump)
 
             if i > 0:
                 words.append(next_word)
 
         return words
 
-    def recombine_triangle(self, parts, ab_num_words):
+    def recombine_triangle(self, parts, ab_num_words, dump):
 
         # empty result so far
         words = list()
 
         # flush recombinator pipeline
-        self._flush_pipeline()
+        self._flush_pipeline(dump)
 
         # the first tick produces null result, so we need n + 1 + 1 = n + 2
         # ticks total and should only save the result word during the last n ticks
         for i in range(ab_num_words + 2):
 
             next_part = parts[i] if i < (ab_num_words + 1) else 0
-            next_word = self._push_pipeline(next_part)
+            next_word = self._push_pipeline(next_part, dump)
 
             if i > 0:
                 words.append(next_word)
 
         return words
 
-    def recombine_rectangle(self, parts, ab_num_words):
+    def recombine_rectangle(self, parts, ab_num_words, dump):
 
         # empty result so far
         words = list()
 
         # flush recombinator pipeline
-        self._flush_pipeline()
+        self._flush_pipeline(dump)
 
         # the first tick produces null result, the last part produces
         # two words, so we need 2 * n + 2 ticks total and should only save
@@ -283,7 +291,7 @@ class ModExpNG_PartRecombinator():
         for i in range(2 * ab_num_words + 2):
 
             next_part = parts[i] if i < (2 * ab_num_words) else 0
-            next_word = self._push_pipeline(next_part)
+            next_word = self._push_pipeline(next_part, dump)
 
             if i > 0:
                 words.append(next_word)
@@ -341,6 +349,12 @@ class ModExpNG_WordMultiplier():
     def _rotate_index_aux(self):
         self._index_aux[0] -= 1
 
+    def _mult_store_part(self, parts, time, column, part_index, mac_index, dump):
+        parts[part_index] = self._macs[mac_index]
+        if dump and DUMP_MULT_PARTS:
+            print("t=%2d, col=%2d > parts[%2d]: mac[%d] = 0x%012x" %
+                (time, column, part_index, mac_index, parts[part_index]))
+                
     def multiply_square(self, a_wide, b_narrow, ab_num_words, dump=False):
 
         if dump: print("multiply_square()")
@@ -385,7 +399,11 @@ class ModExpNG_WordMultiplier():
                     self._update_one_mac(x, ax * bt)
 
                     if t == (col * NUM_MULTS + x):
-                        parts[t] = self._macs[x]
+                        part_index = t
+                        #self._mult_store_part(parts, t, col, part_index, self._macs[x], dump)
+                        self._mult_store_part(parts, t, col, part_index, x, dump)
+
+                            
 
                 if dump and DUMP_MACS_ACCUMULATION:
                     print("t=%2d, col=%2d > "% (t, col), end='')
@@ -399,7 +417,9 @@ class ModExpNG_WordMultiplier():
                 if t == (ab_num_words - 1):
                     for x in range(NUM_MULTS):
                         if not (col == (num_cols - 1) and x == (NUM_MULTS - 1)):
-                            parts[ab_num_words + col * NUM_MULTS + x] = self._macs[x]
+                            part_index = ab_num_words + col * NUM_MULTS + x
+                            #self._mult_store_part(parts, t, col, part_index, self._macs[x], dump)
+                            self._mult_store_part(parts, t, col, part_index, x, dump)
 
         return parts
 
@@ -627,7 +647,7 @@ class ModExpNG_Worker():
             ab = a
         else:
             ab_parts = self.multiplier.multiply_square(a, b, ab_num_words, dump)
-            ab_words = self.recombinator.recombine_square(ab_parts, ab_num_words)
+            ab_words = self.recombinator.recombine_square(ab_parts, ab_num_words, dump)
             ab = ModExpNG_Operand(None, 2 * ab_num_words, ab_words)
 
         if multiply_only:
@@ -635,12 +655,12 @@ class ModExpNG_Worker():
 
         # 2.
         q_parts = self.multiplier.multiply_triangle(ab, n_coeff, ab_num_words)
-        q_words = self.recombinator.recombine_triangle(q_parts, ab_num_words)
+        q_words = self.recombinator.recombine_triangle(q_parts, ab_num_words, dump)
         q = ModExpNG_Operand(None, ab_num_words + 1, q_words)
 
         # 3.
         m_parts = self.multiplier.multiply_rectangle(n, q, ab_num_words)
-        m_words = self.recombinator.recombine_rectangle(m_parts, ab_num_words)
+        m_words = self.recombinator.recombine_rectangle(m_parts, ab_num_words, dump)
         m = ModExpNG_Operand(None, 2 * ab_num_words + 1, m_words)
 
         # 4.


From git at cryptech.is  Mon Aug 19 11:07:04 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:04 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 02/12: Rewrote "square"
 recombination to match how it works in hardware.
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110704.0196B992A04@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit 766bb937c472d027b217216859d57b90e6bc6a6e
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Tue Apr 2 01:02:15 2019 +0300

    Rewrote "square" recombination to match how it works in hardware.
---
 modexpng_fpga_model.py | 94 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 72 insertions(+), 22 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index c73532f..73a21d3 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -238,24 +238,48 @@ class ModExpNG_PartRecombinator():
 
     def recombine_square(self, parts, ab_num_words, dump):
 
-        # empty result so far
-        words = list()
-
-        # flush recombinator pipeline
+        # empty results so far
+        words_lsb = list()  # n words
+        words_msb = list()  # n words
+                
+        # recombine the lower half (n parts)
+        # the first tick produces null result, the last part
+        # produces three words and needs two extra ticks
         self._flush_pipeline(dump)
-
-        # the first tick produces null result, the last part produces
-        # two words, so we need (2*n - 1) + 2 = 2*n + 1 ticks total
-        # and should only save the result word during the last 2 * n ticks
-        for i in range(2 * ab_num_words + 1):
-
-            next_part = parts[i] if i < (2 * ab_num_words - 1) else 0
+        for i in range(ab_num_words + 1 + 2):
+            next_part = parts[i] if i < ab_num_words else 0
             next_word = self._push_pipeline(next_part, dump)
-
+            
             if i > 0:
-                words.append(next_word)
-
+                words_lsb.append(next_word)
+       
+        # recombine the upper half (n-1 parts)
+        # the first tick produces null result
+        self._flush_pipeline(dump)
+        for i in range(ab_num_words + 1):
+            next_part = parts[i + ab_num_words] if i < (ab_num_words - 1) else 0
+            next_word = self._push_pipeline(next_part, dump)
+            
+            if i > 0:
+                words_msb.append(next_word)
+        
+        # merge words
+        words = list()
+        
+        # merge lower half
+        for x in range(ab_num_words):
+            next_word = words_lsb[x]
+            words.append(next_word)
+            
+        # merge upper half adding the two overlapping words
+        for x in range(ab_num_words):
+            next_word = words_msb[x]
+            if x < 2:
+                next_word += words_lsb[x + ab_num_words]
+            words.append(next_word)
+                    
         return words
+        
 
     def recombine_triangle(self, parts, ab_num_words, dump):
 
@@ -301,6 +325,9 @@ class ModExpNG_PartRecombinator():
 
 class ModExpNG_WordMultiplier():
 
+    _a_seen_17 = False
+    _b_seen_17 = False
+
     def __init__(self):
 
         self._macs = list()
@@ -326,8 +353,22 @@ class ModExpNG_WordMultiplier():
     def _clear_mac_aux(self):
         self._mac_aux[0] = 0
 
-    def _update_one_mac(self, x, value):
-        self._macs[x] += value
+    def _update_one_mac(self, x, a, b):
+    
+        if a > 0xFFFF:
+            self._a_seen_17 = True
+
+        if b > 0xFFFF:
+            self._b_seen_17 = True
+            
+        if a > 0x1FFFF:
+            raise("a > 0x1FFFF!")
+            
+        if b > 0x1FFFF:
+            raise("b > 0x1FFFF!")
+            
+        p = a * b
+        self._macs[x] += p
 
     def _update_mac_aux(self, value):
         self._mac_aux[0] += value
@@ -396,11 +437,10 @@ class ModExpNG_WordMultiplier():
                 # multiply by a-words
                 for x in range(NUM_MULTS):
                     ax = a_wide.words[self._indices[x]]
-                    self._update_one_mac(x, ax * bt)
+                    self._update_one_mac(x, ax, bt)
 
                     if t == (col * NUM_MULTS + x):
                         part_index = t
-                        #self._mult_store_part(parts, t, col, part_index, self._macs[x], dump)
                         self._mult_store_part(parts, t, col, part_index, x, dump)
 
                             
@@ -418,7 +458,6 @@ class ModExpNG_WordMultiplier():
                     for x in range(NUM_MULTS):
                         if not (col == (num_cols - 1) and x == (NUM_MULTS - 1)):
                             part_index = ab_num_words + col * NUM_MULTS + x
-                            #self._mult_store_part(parts, t, col, part_index, self._macs[x], dump)
                             self._mult_store_part(parts, t, col, part_index, x, dump)
 
         return parts
@@ -450,7 +489,7 @@ class ModExpNG_WordMultiplier():
                 # multiply by a-words
                 for x in range(NUM_MULTS):
                     ax = a_wide.words[self._indices[x]]
-                    self._update_one_mac(x, ax * bt)
+                    self._update_one_mac(x, ax, bt)
 
                     if t == (col * NUM_MULTS + x):
                         parts[t] = self._macs[x]
@@ -495,7 +534,7 @@ class ModExpNG_WordMultiplier():
                 # multiply by a-words
                 for x in range(NUM_MULTS):
                     ax = a_wide.words[self._indices[x]]
-                    self._update_one_mac(x, ax * bt)
+                    self._update_one_mac(x, ax, bt)
 
                     # don't save one value for the very last time instant per column
                     if t < ab_num_words and t == (col * NUM_MULTS + x):
@@ -750,10 +789,21 @@ if __name__ == "__main__":
 
     mp_blind_factor              = worker.multiply(mp_blind,                     vector.p_factor,  vector.p, vector.p_coeff, pq_num_words, dump=True)
     mq_blind_factor              = worker.multiply(mq_blind,                     vector.q_factor,  vector.q, vector.q_coeff, pq_num_words)
-
+    
     sp_blind_factor              = worker.exponentiate(ip_factor, mp_blind_factor, vector.dp, vector.p, vector.p_factor, vector.p_coeff, pq_num_words)
     sq_blind_factor              = worker.exponentiate(iq_factor, mq_blind_factor, vector.dq, vector.q, vector.q_factor, vector.q_coeff, pq_num_words)
 
+    if worker.multiplier._a_seen_17:
+        print("17-bit wide A's seen.")
+    else:
+        print("17-bit wide A's not detected.")
+
+    if worker.multiplier._b_seen_17:
+        print("17-bit wide B's seen.")
+    else:
+        print("17-bit wide B's not detected.")
+
+    
     sp_blind                     = worker.multiply(i,                            sp_blind_factor,  vector.p, vector.p_coeff, pq_num_words)
     sq_blind                     = worker.multiply(i,                            sq_blind_factor,  vector.q, vector.q_coeff, pq_num_words)
 


From git at cryptech.is  Mon Aug 19 11:07:05 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:05 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 03/12: Same changes for
 "triangle" multiplication phase as for the "square" one (debugging output,
 simpler MAC clearing and index rotation logic).
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110704.66CC5992A00@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit a105c876cb3b48375e860a03ee6edd18123b0e65
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Tue Apr 2 01:54:44 2019 +0300

    Same changes for "triangle" multiplication phase as for the "square" one
    (debugging output, simpler MAC clearing and index rotation logic).
---
 modexpng_fpga_model.py | 65 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 20 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index 73a21d3..5632a6f 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -77,9 +77,9 @@ _VECTOR_CLASS = "Vector"
 DUMP_VECTORS = False
 DUMP_INDICES = False
 DUMP_MACS_CLEARING = False
-DUMP_MACS_ACCUMULATION = False
+DUMP_MACS_ACCUMULATION = True
 DUMP_MULT_PARTS = False
-DUMP_RCMB = True
+DUMP_RCMB = False
 
 
 #
@@ -395,6 +395,12 @@ class ModExpNG_WordMultiplier():
         if dump and DUMP_MULT_PARTS:
             print("t=%2d, col=%2d > parts[%2d]: mac[%d] = 0x%012x" %
                 (time, column, part_index, mac_index, parts[part_index]))
+
+    def _mult_store_part_aux(self, parts, time, column, part_index, mac_index, dump):
+        parts[part_index] = self._mac_aux[mac_index]
+        if dump and DUMP_MULT_PARTS:
+            print("t=%2d, col=%2d > parts[%2d]: mac_aux[%d] = 0x%012x" %
+                (time, column, part_index, mac_index, parts[part_index]))
                 
     def multiply_square(self, a_wide, b_narrow, ab_num_words, dump=False):
 
@@ -443,8 +449,6 @@ class ModExpNG_WordMultiplier():
                         part_index = t
                         self._mult_store_part(parts, t, col, part_index, x, dump)
 
-                            
-
                 if dump and DUMP_MACS_ACCUMULATION:
                     print("t=%2d, col=%2d > "% (t, col), end='')
                     for i in range(NUM_MULTS):
@@ -462,7 +466,9 @@ class ModExpNG_WordMultiplier():
 
         return parts
 
-    def multiply_triangle(self, a_wide, b_narrow, ab_num_words):
+    def multiply_triangle(self, a_wide, b_narrow, ab_num_words, dump=False):
+
+        if dump: print("multiply_triangle()")
 
         num_cols = ab_num_words // NUM_MULTS
 
@@ -474,15 +480,27 @@ class ModExpNG_WordMultiplier():
 
             last_col = col == (num_cols - 1)
 
-            self._clear_all_macs()
-            self._preset_indices(col)
-
-            if last_col:
-                self._clear_mac_aux()
-                self._preset_index_aux(num_cols)
-
             for t in range(ab_num_words + 1):
 
+                if t == 0:
+                    self._preset_indices(col)
+                    if last_col:
+                        self._preset_index_aux(num_cols)
+                else:
+                    self._rotate_indices(ab_num_words)
+                    if last_col:
+                        self._rotate_index_aux()
+            
+                if t == 0:
+                    self._clear_all_macs()
+                    if dump and DUMP_MACS_CLEARING:
+                        print("t= 0, col=%2d > clear > all" % (col))
+
+                    if last_col:
+                        self._clear_mac_aux()
+                        if dump and DUMP_MACS_CLEARING:
+                            print("t= 0, col=%2d > clear > aux" % (col))
+                        
                 # current b-word
                 bt = b_narrow.words[t]
 
@@ -492,7 +510,8 @@ class ModExpNG_WordMultiplier():
                     self._update_one_mac(x, ax, bt)
 
                     if t == (col * NUM_MULTS + x):
-                        parts[t] = self._macs[x]
+                        part_index = t
+                        self._mult_store_part(parts, t, col, part_index, x, dump)
 
                 # aux multiplier
                 if last_col:
@@ -500,17 +519,23 @@ class ModExpNG_WordMultiplier():
                     self._update_mac_aux(ax * bt)
 
                     if t == ab_num_words:
-                        parts[t] = self._mac_aux[0]
+                        part_index = t
+                        self._mult_store_part_aux(parts, t, col, part_index, 0, dump)
+
+                if dump and DUMP_MACS_ACCUMULATION:
+                    print("t=%2d, col=%2d > "% (t, col), end='')
+                    for i in range(NUM_MULTS):
+                        if i > 0: print(" | ", end='')
+                        print("mac[%d]: 0x%012x" % (i, self._macs[i]), end='')
+                    if last_col:
+                        print(" | mac_aux[ 0]: 0x%012x" % (self._mac_aux[0]), end='')
+                    print("")
 
+                        
                 # shortcut
                 if not last_col:
                     if t == (NUM_MULTS * (col + 1) - 1): break
 
-                # advance indices
-                self._rotate_indices(ab_num_words)
-                if last_col:
-                    self._rotate_index_aux()
-
         return parts
 
     def multiply_rectangle(self, a_wide, b_narrow, ab_num_words):
@@ -693,7 +718,7 @@ class ModExpNG_Worker():
             return ModExpNG_Operand(None, 2*ab_num_words, ab_words)
 
         # 2.
-        q_parts = self.multiplier.multiply_triangle(ab, n_coeff, ab_num_words)
+        q_parts = self.multiplier.multiply_triangle(ab, n_coeff, ab_num_words, dump)
         q_words = self.recombinator.recombine_triangle(q_parts, ab_num_words, dump)
         q = ModExpNG_Operand(None, ab_num_words + 1, q_words)
 


From git at cryptech.is  Mon Aug 19 11:07:06 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:06 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 04/12: Cosmetic fixes.
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110704.C5294992A06@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit ac6bc69356f2bbc97e20064380045b0305a3d0ed
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Tue Apr 2 13:31:14 2019 +0300

    Cosmetic fixes.
---
 modexpng_fpga_model.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index 5632a6f..cf4d7f8 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -284,22 +284,21 @@ class ModExpNG_PartRecombinator():
     def recombine_triangle(self, parts, ab_num_words, dump):
 
         # empty result so far
-        words = list()
-
-        # flush recombinator pipeline
-        self._flush_pipeline(dump)
+        words_lsb = list()
 
+        # recombine the lower half (n+1 parts)
         # the first tick produces null result, so we need n + 1 + 1 = n + 2
         # ticks total and should only save the result word during the last n ticks
+        self._flush_pipeline(dump)
         for i in range(ab_num_words + 2):
 
             next_part = parts[i] if i < (ab_num_words + 1) else 0
             next_word = self._push_pipeline(next_part, dump)
 
             if i > 0:
-                words.append(next_word)
+                words_lsb.append(next_word)
 
-        return words
+        return words_lsb
 
     def recombine_rectangle(self, parts, ab_num_words, dump):
 


From git at cryptech.is  Mon Aug 19 11:07:07 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:07 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 05/12: Removed some
 boilerplate code,
 all the three multiplication flavours are now working consistently. Still
 need to rework recombination routines.
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110705.3A10B992A02@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit aaf45e285cb15df841bfc3e84c92be9bdd90683e
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Tue Apr 2 14:16:49 2019 +0300

    Removed some boilerplate code, all the three multiplication flavours are now
    working consistently. Still need to rework recombination routines.
---
 modexpng_fpga_model.py | 197 +++++++++++++++++++++++++++++--------------------
 1 file changed, 117 insertions(+), 80 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index cf4d7f8..0726eaa 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -77,9 +77,9 @@ _VECTOR_CLASS = "Vector"
 DUMP_VECTORS = False
 DUMP_INDICES = False
 DUMP_MACS_CLEARING = False
-DUMP_MACS_ACCUMULATION = True
+DUMP_MACS_ACCUMULATION = False
 DUMP_MULT_PARTS = False
-DUMP_RCMB = False
+DUMP_RCMB = True
 
 
 #
@@ -232,7 +232,7 @@ class ModExpNG_PartRecombinator():
         # dump
         if dump and DUMP_RCMB:
             print("RCMB -> push(): part = 0x%012x, word = 0x%04x" % (part, t))
-        
+
         # done
         return t
 
@@ -241,7 +241,7 @@ class ModExpNG_PartRecombinator():
         # empty results so far
         words_lsb = list()  # n words
         words_msb = list()  # n words
-                
+
         # recombine the lower half (n parts)
         # the first tick produces null result, the last part
         # produces three words and needs two extra ticks
@@ -249,37 +249,37 @@ class ModExpNG_PartRecombinator():
         for i in range(ab_num_words + 1 + 2):
             next_part = parts[i] if i < ab_num_words else 0
             next_word = self._push_pipeline(next_part, dump)
-            
+
             if i > 0:
                 words_lsb.append(next_word)
-       
+
         # recombine the upper half (n-1 parts)
         # the first tick produces null result
         self._flush_pipeline(dump)
         for i in range(ab_num_words + 1):
             next_part = parts[i + ab_num_words] if i < (ab_num_words - 1) else 0
             next_word = self._push_pipeline(next_part, dump)
-            
+
             if i > 0:
                 words_msb.append(next_word)
-        
+
         # merge words
         words = list()
-        
+
         # merge lower half
         for x in range(ab_num_words):
             next_word = words_lsb[x]
             words.append(next_word)
-            
+
         # merge upper half adding the two overlapping words
         for x in range(ab_num_words):
             next_word = words_msb[x]
             if x < 2:
                 next_word += words_lsb[x + ab_num_words]
             words.append(next_word)
-                    
+
         return words
-        
+
 
     def recombine_triangle(self, parts, ab_num_words, dump):
 
@@ -342,30 +342,39 @@ class ModExpNG_WordMultiplier():
         self._mac_aux.append(0)
         self._index_aux.append(0)
 
-    def _clear_all_macs(self):
+    def _clear_all_macs(self, t, col, dump):
         for x in range(NUM_MULTS):
             self._macs[x] = 0
+        if dump and DUMP_MACS_CLEARING:
+            print("t=%2d, col=%2d > clear > all" % (t, col))
+
 
-    def _clear_one_mac(self, x):
+    def _clear_one_mac(self, x, t, col, dump):
         self._macs[x] = 0
+        if dump and DUMP_MACS_CLEARING:
+            print("t=%2d, col=%2d > clear > x=%d" % (t, col, x))
 
-    def _clear_mac_aux(self):
+
+    def _clear_mac_aux(self, t, col, dump):
         self._mac_aux[0] = 0
+        if dump and DUMP_MACS_CLEARING:
+            print("t= 0, col=%2d > clear > aux" % (col))
+
 
     def _update_one_mac(self, x, a, b):
-    
+
         if a > 0xFFFF:
             self._a_seen_17 = True
 
         if b > 0xFFFF:
             self._b_seen_17 = True
-            
+
         if a > 0x1FFFF:
             raise("a > 0x1FFFF!")
-            
+
         if b > 0x1FFFF:
             raise("b > 0x1FFFF!")
-            
+
         p = a * b
         self._macs[x] += p
 
@@ -379,6 +388,35 @@ class ModExpNG_WordMultiplier():
     def _preset_index_aux(self, num_cols):
         self._index_aux[0] = num_cols * len(self._indices)
 
+    def _dump_macs_helper(self, t, col, aux=False):
+        print("t=%2d, col=%2d > "% (t, col), end='')
+        for i in range(NUM_MULTS):
+            if i > 0: print(" | ", end='')
+            print("mac[%d]: 0x%012x" % (i, self._macs[i]), end='')
+        if aux:
+            print(" | mac_aux[ 0]: 0x%012x" % (self._mac_aux[0]), end='')
+        print("")
+
+    def _dump_macs(self, t, col):
+        self._dump_macs_helper(t, col)
+
+    def _dump_macs_aux(self, t, col):
+        self._dump_macs_helper(t, col, True)
+
+    def _dump_indices_helper(self, t, col, aux=False):
+        print("t=%2d, col=%2d > indices:" % (t, col), end='')
+        for i in range(NUM_MULTS):
+            print(" %2d" % self._indices[i], end='')
+        if aux:
+            print(" %2d" % self._index_aux[0], end='')
+        print("")
+
+    def _dump_indices(self, t, col):
+        self._dump_indices_helper(t, col)
+
+    def _dump_indices_aux(self, t, col):
+        self._dump_indices_helper(t, col, True)
+
     def _rotate_indices(self, num_words):
         for x in range(len(self._indices)):
             if self._indices[x] > 0:
@@ -400,7 +438,7 @@ class ModExpNG_WordMultiplier():
         if dump and DUMP_MULT_PARTS:
             print("t=%2d, col=%2d > parts[%2d]: mac_aux[%d] = 0x%012x" %
                 (time, column, part_index, mac_index, parts[part_index]))
-                
+
     def multiply_square(self, a_wide, b_narrow, ab_num_words, dump=False):
 
         if dump: print("multiply_square()")
@@ -415,26 +453,20 @@ class ModExpNG_WordMultiplier():
 
             for t in range(ab_num_words):
 
-                if t == 0: self._preset_indices(col)    
+                # take care of indices
+                if t == 0: self._preset_indices(col)
                 else:      self._rotate_indices(ab_num_words)
 
+                # take care of macs
                 if t == 0:
-                    self._clear_all_macs()
-                    if dump and DUMP_MACS_CLEARING:
-                        print("t= 0, col=%2d > clear > all" % (col))
+                    self._clear_all_macs(t, col, dump)
                 else:
                     t1 = t - 1
                     if (t1 // 8) == col:
-                        self._clear_one_mac(t1 % NUM_MULTS)
-                        if dump and DUMP_MACS_CLEARING:
-                            print("t=%2d, col=%2d > clear > x=%d:" % (t, col, t1 % NUM_MULTS))
-
+                        self._clear_one_mac(t1 % NUM_MULTS, t, col, dump)
 
-                if dump and DUMP_INDICES:
-                    print("t=%2d, col=%2d > indices:" % (t, col), end='')
-                    for i in range(NUM_MULTS):
-                        print(" %2d" % self._indices[i], end='')
-                    print("")
+                # debug output
+                if dump and DUMP_INDICES: self._dump_indices(t, col)
 
                 # current b-word
                 bt = b_narrow.words[t]
@@ -448,12 +480,8 @@ class ModExpNG_WordMultiplier():
                         part_index = t
                         self._mult_store_part(parts, t, col, part_index, x, dump)
 
-                if dump and DUMP_MACS_ACCUMULATION:
-                    print("t=%2d, col=%2d > "% (t, col), end='')
-                    for i in range(NUM_MULTS):
-                        if i > 0: print(" | ", end='')
-                        print("mac[%d]: 0x%012x" % (i, self._macs[i]), end='')
-                    print("")
+                # debug output
+                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs(t, col)
 
                 # save the uppers part of product at end of column,
                 # for the last column don't save the very last part
@@ -481,25 +509,25 @@ class ModExpNG_WordMultiplier():
 
             for t in range(ab_num_words + 1):
 
-                if t == 0:
-                    self._preset_indices(col)
-                    if last_col:
-                        self._preset_index_aux(num_cols)
-                else:
-                    self._rotate_indices(ab_num_words)
-                    if last_col:
-                        self._rotate_index_aux()
-            
-                if t == 0:
-                    self._clear_all_macs()
-                    if dump and DUMP_MACS_CLEARING:
-                        print("t= 0, col=%2d > clear > all" % (col))
-
-                    if last_col:
-                        self._clear_mac_aux()
-                        if dump and DUMP_MACS_CLEARING:
-                            print("t= 0, col=%2d > clear > aux" % (col))
-                        
+                # take care of indices
+                if t == 0: self._preset_indices(col)
+                else:      self._rotate_indices(ab_num_words)
+
+                # take care of auxilary index
+                if last_col:
+                    if t == 0: self._preset_index_aux(num_cols)
+                    else:      self._rotate_index_aux()
+
+                # take care of macs
+                if t == 0: self._clear_all_macs(t, col, dump)
+
+                # take care of auxilary mac
+                if last_col:
+                    if t == 0: self._clear_mac_aux(t, col, dump)
+
+                # debug output
+                if dump and DUMP_INDICES: self._dump_indices_aux(t, col)
+
                 # current b-word
                 bt = b_narrow.words[t]
 
@@ -521,23 +549,18 @@ class ModExpNG_WordMultiplier():
                         part_index = t
                         self._mult_store_part_aux(parts, t, col, part_index, 0, dump)
 
-                if dump and DUMP_MACS_ACCUMULATION:
-                    print("t=%2d, col=%2d > "% (t, col), end='')
-                    for i in range(NUM_MULTS):
-                        if i > 0: print(" | ", end='')
-                        print("mac[%d]: 0x%012x" % (i, self._macs[i]), end='')
-                    if last_col:
-                        print(" | mac_aux[ 0]: 0x%012x" % (self._mac_aux[0]), end='')
-                    print("")
+                # debug output
+                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs_aux(t, col)
 
-                        
                 # shortcut
                 if not last_col:
                     if t == (NUM_MULTS * (col + 1) - 1): break
 
         return parts
 
-    def multiply_rectangle(self, a_wide, b_narrow, ab_num_words):
+    def multiply_rectangle(self, a_wide, b_narrow, ab_num_words, dump=False):
+
+        if dump: print("multiply_rectangle()")
 
         num_cols = ab_num_words // NUM_MULTS
 
@@ -547,10 +570,22 @@ class ModExpNG_WordMultiplier():
 
         for col in range(num_cols):
 
-            self._clear_all_macs()
-            self._preset_indices(col)
+            for t in range(ab_num_words + 1):
 
-            for t in range(ab_num_words+1):
+                # take care of indices
+                if t == 0: self._preset_indices(col)
+                else:      self._rotate_indices(ab_num_words)
+
+                # take care of macs
+                if t == 0:
+                    self._clear_all_macs(t, col, dump)
+                else:
+                    t1 = t - 1
+                    if (t1 // 8) == col:
+                        self._clear_one_mac(t1 % NUM_MULTS, t, col, dump)
+
+                # debug output
+                if dump and DUMP_INDICES: self._dump_indices(t, col)
 
                 # current b-word
                 bt = b_narrow.words[t]
@@ -562,15 +597,17 @@ class ModExpNG_WordMultiplier():
 
                     # don't save one value for the very last time instant per column
                     if t < ab_num_words and t == (col * NUM_MULTS + x):
-                        parts[t] = self._macs[x]
-                        self._clear_one_mac(x)
+                        part_index = t
+                        self._mult_store_part(parts, t, col, part_index, x, dump)
+
+                # debug output
+                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs(t, col)
 
-                # save the uppers part of product at end of column
+                # save the upper parts of product at end of column
                 if t == ab_num_words:
                     for x in range(NUM_MULTS):
-                        parts[ab_num_words + col * NUM_MULTS + x] = self._macs[x]
-
-                self._rotate_indices(ab_num_words)
+                        part_index = ab_num_words + col * NUM_MULTS + x
+                        self._mult_store_part(parts, t, col, part_index, x, dump)
 
         return parts
 
@@ -722,7 +759,7 @@ class ModExpNG_Worker():
         q = ModExpNG_Operand(None, ab_num_words + 1, q_words)
 
         # 3.
-        m_parts = self.multiplier.multiply_rectangle(n, q, ab_num_words)
+        m_parts = self.multiplier.multiply_rectangle(n, q, ab_num_words, dump)
         m_words = self.recombinator.recombine_rectangle(m_parts, ab_num_words, dump)
         m = ModExpNG_Operand(None, 2 * ab_num_words + 1, m_words)
 
@@ -813,7 +850,7 @@ if __name__ == "__main__":
 
     mp_blind_factor              = worker.multiply(mp_blind,                     vector.p_factor,  vector.p, vector.p_coeff, pq_num_words, dump=True)
     mq_blind_factor              = worker.multiply(mq_blind,                     vector.q_factor,  vector.q, vector.q_coeff, pq_num_words)
-    
+
     sp_blind_factor              = worker.exponentiate(ip_factor, mp_blind_factor, vector.dp, vector.p, vector.p_factor, vector.p_coeff, pq_num_words)
     sq_blind_factor              = worker.exponentiate(iq_factor, mq_blind_factor, vector.dq, vector.q, vector.q_factor, vector.q_coeff, pq_num_words)
 
@@ -827,7 +864,7 @@ if __name__ == "__main__":
     else:
         print("17-bit wide B's not detected.")
 
-    
+
     sp_blind                     = worker.multiply(i,                            sp_blind_factor,  vector.p, vector.p_coeff, pq_num_words)
     sq_blind                     = worker.multiply(i,                            sq_blind_factor,  vector.q, vector.q_coeff, pq_num_words)
 


From git at cryptech.is  Mon Aug 19 11:07:08 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:08 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 06/12: Fixed 4096-bit
 test vector generation.
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110706.12449992A09@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit e79b4bbd96670fdfd71cf24f61c9712445ad4f0f
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Thu Apr 4 13:51:13 2019 +0300

    Fixed 4096-bit test vector generation.
---
 vector/vector_format.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vector/vector_format.py b/vector/vector_format.py
index 67a50f0..a3e7e81 100644
--- a/vector/vector_format.py
+++ b/vector/vector_format.py
@@ -34,6 +34,7 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 
+import sys
 import vector_util
 
 SCRIPT_USAGE = "USAGE: vector_format.py [openssl_binary]"
@@ -46,6 +47,9 @@ RNG_SEED_BLINDING = 2
 
 if __name__ == "__main__":
 
+    # ModInv fails otherwise...
+    sys.setrecursionlimit(int(1.5 * KEY_LENGTH))
+
     OPENSSL_BINARY = vector_util.openssl_binary(SCRIPT_USAGE)
 
     if len(OPENSSL_BINARY) > 0:


From git at cryptech.is  Mon Aug 19 11:07:09 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:09 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 07/12: Intermediate
 version to fix recombination overflow bug.
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110709.3AAC0992A51@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit 345be7560678db312767ae3a798123d7feb86003
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Thu Apr 4 13:52:07 2019 +0300

    Intermediate version to fix recombination overflow bug.
---
 modexpng_fpga_model.py | 119 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 92 insertions(+), 27 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index 0726eaa..d33f314 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -79,7 +79,7 @@ DUMP_INDICES = False
 DUMP_MACS_CLEARING = False
 DUMP_MACS_ACCUMULATION = False
 DUMP_MULT_PARTS = False
-DUMP_RCMB = True
+DUMP_RCMB = False
 
 
 #
@@ -119,7 +119,7 @@ class ModExpNG_Operand():
         for i in range(count):
 
             # word must not exceed 17 bits
-            if words[i] >= (2 ** (_WORD_WIDTH + 1)):
+            if words[i] >= (2 ** (_WORD_WIDTH + 2)):
                 raise Exception("Word is too large!")
 
         self.words = words
@@ -274,13 +274,12 @@ class ModExpNG_PartRecombinator():
         # merge upper half adding the two overlapping words
         for x in range(ab_num_words):
             next_word = words_msb[x]
-            if x < 2:
+            if x < 2:                
                 next_word += words_lsb[x + ab_num_words]
             words.append(next_word)
 
         return words
 
-
     def recombine_triangle(self, parts, ab_num_words, dump):
 
         # empty result so far
@@ -303,21 +302,62 @@ class ModExpNG_PartRecombinator():
     def recombine_rectangle(self, parts, ab_num_words, dump):
 
         # empty result so far
+        words_lsb = list()  # n words
+        words_msb = list()  # n+1 words
+
+        # recombine the lower half (n parts)
+        # the first tick produces null result, the last part
+        # produces three words and needs two extra ticks
+        self._flush_pipeline(dump)
+        for i in range(ab_num_words + 1 + 2):
+            next_part = parts[i] if i < ab_num_words else 0
+            next_word = self._push_pipeline(next_part, dump)
+
+            if i > 0:
+                words_lsb.append(next_word)
+
+        # recombine the upper half (n parts)
+        # the first tick produces null result, the last part
+        # produces two words and needs an extra tick
+        self._flush_pipeline(dump)
+        for i in range(ab_num_words + 2):
+            next_part = parts[i + ab_num_words] if i < ab_num_words else 0
+            next_word = self._push_pipeline(next_part, dump)
+
+            if i > 0:
+                words_msb.append(next_word)
+                
+        # merge words
         words = list()
 
+        # merge lower half
+        for x in range(ab_num_words):
+            next_word = words_lsb[x]
+            words.append(next_word)
+
+        # merge upper half adding the two overlapping words
+        for x in range(ab_num_words + 1):
+            next_word = words_msb[x]
+            if x < 2:
+                next_word += words_lsb[x + ab_num_words]
+            words.append(next_word)
+
+        return words
+
+                
         # flush recombinator pipeline
-        self._flush_pipeline(dump)
+        #self._flush_pipeline(dump)
 
         # the first tick produces null result, the last part produces
         # two words, so we need 2 * n + 2 ticks total and should only save
         # the result word during the last 2 * n + 1 ticks
-        for i in range(2 * ab_num_words + 2):
+        #for i in range(2 * ab_num_words + 2):
 
-            next_part = parts[i] if i < (2 * ab_num_words) else 0
-            next_word = self._push_pipeline(next_part, dump)
+            #next_part = parts[i] if i < (2 * ab_num_words) else 0
+            #next_word = self._push_pipeline(next_part, dump)
 
-            if i > 0:
-                words.append(next_word)
+            #if i > 0:
+                #words.append(next_word)
 
         return words
 
@@ -369,11 +409,11 @@ class ModExpNG_WordMultiplier():
         if b > 0xFFFF:
             self._b_seen_17 = True
 
-        if a > 0x1FFFF:
-            raise("a > 0x1FFFF!")
+        if a > 0x3FFFF:
+            raise Exception("a > 0x3FFFF!")
 
         if b > 0x1FFFF:
-            raise("b > 0x1FFFF!")
+            raise Exception("b > 0x1FFFF!")
 
         p = a * b
         self._macs[x] += p
@@ -451,6 +491,8 @@ class ModExpNG_WordMultiplier():
 
         for col in range(num_cols):
 
+            bt_carry = 0
+        
             for t in range(ab_num_words):
 
                 # take care of indices
@@ -469,7 +511,10 @@ class ModExpNG_WordMultiplier():
                 if dump and DUMP_INDICES: self._dump_indices(t, col)
 
                 # current b-word
-                bt = b_narrow.words[t]
+                bt = b_narrow.words[t] + bt_carry
+                bt_carry = bt >> _WORD_WIDTH
+                bt &= 0xffff
+                
 
                 # multiply by a-words
                 for x in range(NUM_MULTS):
@@ -659,6 +704,8 @@ class ModExpNG_LowlevelOperator():
 
 class ModExpNG_Worker():
 
+    max_zzz = 0
+
     def __init__(self):
         self.recombinator = ModExpNG_PartRecombinator()
         self.multiplier   = ModExpNG_WordMultiplier()
@@ -764,26 +811,40 @@ class ModExpNG_Worker():
         m = ModExpNG_Operand(None, 2 * ab_num_words + 1, m_words)
 
         # 4.
-        r_xwords = list()
-        for i in range(2*ab_num_words):
-            r_xwords.append(ab.words[i] + m.words[i])
-
-        r_xwords.append(m.words[2 * ab_num_words])
-
         cy = 0
-        for i in range(ab_num_words+1):
-            s = r_xwords[i] + cy
+        for i in range(ab_num_words + 1):
+            s = ab.words[i] + m.words[i] + cy
             cy = s >> 16
-
+            
         R = list()
         for i in range(ab_num_words):
             R.append(0)
 
-        R[0] += cy # !!!
-
+        R[0] = cy # !!! (cy is 2 bits, i.e. 0..3)
+        
+        if dump:
+            if ab.words[ab_num_words + 2] > 0:
+                ab.words[ab_num_words + 2] -= 1
+                ab.words[ab_num_words + 1] += 0x10000
+            if m.words[ab_num_words + 2] > 0:
+                m.words[ab_num_words + 2] -= 1
+                m.words[ab_num_words + 1] += 0x10000
+        
         for i in range(ab_num_words):
-            R[i] += r_xwords[ab_num_words + i + 1]
-
+            ab_word = ab.words[ab_num_words + i + 1] if i < (ab_num_words - 1) else 0
+            m_word = m.words[ab_num_words + i + 1]
+            
+            R[i] += ab_word + m_word
+
+            #if i == 0:
+                #if R[i] > self.max_zzz:
+                    #self.max_zzz = R[i]
+                    #print("self.max_zzz = %05x" % R[i])
+                #if R[i] > 0x1ffff:
+                    #sys.exit(123)
+                
+            
+                
         return ModExpNG_Operand(None, ab_num_words, R)
 
     def reduce(self, a):
@@ -816,6 +877,7 @@ if __name__ == "__main__":
     x_mutated_known = pow(vector.x.number(), 2, vector.n.number())
     y_mutated_known = pow(vector.y.number(), 2, vector.n.number())
 
+    
     # bring one into Montgomery domain (glue 2**r to one)
     # bring blinding coefficients into Montgomery domain (glue 2**(2*r) to x and y)
     # blind message
@@ -864,6 +926,7 @@ if __name__ == "__main__":
     else:
         print("17-bit wide B's not detected.")
 
+        
 
     sp_blind                     = worker.multiply(i,                            sp_blind_factor,  vector.p, vector.p_coeff, pq_num_words)
     sq_blind                     = worker.multiply(i,                            sq_blind_factor,  vector.q, vector.q_coeff, pq_num_words)
@@ -874,6 +937,8 @@ if __name__ == "__main__":
     sr_qinv_blind                = worker.multiply(sr_qinv_blind_inverse_factor, vector.p_factor,  vector.p, vector.p_coeff, pq_num_words)
     q_sr_qinv_blind              = worker.multiply(vector.q,                     sr_qinv_blind,    None,     None,           pq_num_words, multiply_only=True)
 
+    worker.reduce(q_sr_qinv_blind)
+    
     s_crt_blinded                = worker.add(sq_blind, q_sr_qinv_blind, pq_num_words)
 
     s_crt_unblinded              = worker.multiply(s_crt_blinded,                x_factor,         vector.n, vector.n_coeff, n_num_words)


From git at cryptech.is  Mon Aug 19 11:07:10 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:10 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 08/12: * Added more
 debugging options: - intentionally trigger internal overflow handler - dump
 MAC inputs - dump intermediate numbers during the reduction phase
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110710.13FB4992A09@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit c165ddceb00b9ba79e8cd238f9228736875dacb8
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Mon Aug 19 13:44:28 2019 +0300

    * Added more debugging options:
     - intentionally trigger internal overflow handler
     - dump MAC inputs
     - dump intermediate numbers during the reduction phase
    
    * Bus width changes
    
    * Some cosmetic changes
---
 modexpng_fpga_model.py | 335 +++++++++++++++++++++++++++----------------------
 1 file changed, 185 insertions(+), 150 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index d33f314..cc3e868 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -74,12 +74,15 @@ _VECTOR_CLASS = "Vector"
 # ------------------
 # Debugging Settings
 # ------------------
+FORCE_OVERFLOW = False
 DUMP_VECTORS = False
 DUMP_INDICES = False
+DUMP_MACS_INPUTS = False
 DUMP_MACS_CLEARING = False
 DUMP_MACS_ACCUMULATION = False
 DUMP_MULT_PARTS = False
 DUMP_RCMB = False
+DUMP_REDUCTION = False
 
 
 #
@@ -111,14 +114,14 @@ class ModExpNG_Operand():
             if i > 0:
                 if (i % 4) == 0: print("")
                 else:            print(" ", end='')
-            print("%s[%2d] = 17'h%05x;" % (name, i, self.words[i]), end='')
+            print("%s[%2d] = 18'h%05x;" % (name, i, self.words[i]), end='')
         print("")
 
     def _init_from_words(self, words, count):
 
         for i in range(count):
 
-            # word must not exceed 17 bits
+            # word must not exceed 18 bits
             if words[i] >= (2 ** (_WORD_WIDTH + 2)):
                 raise Exception("Word is too large!")
 
@@ -221,7 +224,7 @@ class ModExpNG_PartRecombinator():
         # shift to the right
         z1 = z
         y1 = y + self.z0
-        x1 = x + self.y0 + (self.x0 >> 16) # IMPORTANT: This carry can be up to two bits wide!!
+        x1 = x + self.y0 + (self.x0 >> _WORD_WIDTH) # IMPORTANT: This carry can be up to two bits wide!!
 
         # save lower 16 bits of the rightmost cell
         t = self.x0 & 0xffff
@@ -287,7 +290,8 @@ class ModExpNG_PartRecombinator():
 
         # recombine the lower half (n+1 parts)
         # the first tick produces null result, so we need n + 1 + 1 = n + 2
-        # ticks total and should only save the result word during the last n ticks
+        # ticks total and should only save the result word during the last
+        # n + 1 ticks
         self._flush_pipeline(dump)
         for i in range(ab_num_words + 2):
 
@@ -344,29 +348,9 @@ class ModExpNG_PartRecombinator():
 
         return words
 
-                
-        # flush recombinator pipeline
-        #self._flush_pipeline(dump)
-
-        # the first tick produces null result, the last part produces
-        # two words, so we need 2 * n + 2 ticks total and should only save
-        # the result word during the last 2 * n + 1 ticks
-        #for i in range(2 * ab_num_words + 2):
-
-            #next_part = parts[i] if i < (2 * ab_num_words) else 0
-            #next_word = self._push_pipeline(next_part, dump)
-
-            #if i > 0:
-                #words.append(next_word)
-
-        return words
-
 
 class ModExpNG_WordMultiplier():
 
-    _a_seen_17 = False
-    _b_seen_17 = False
-
     def __init__(self):
 
         self._macs = list()
@@ -388,38 +372,45 @@ class ModExpNG_WordMultiplier():
         if dump and DUMP_MACS_CLEARING:
             print("t=%2d, col=%2d > clear > all" % (t, col))
 
-
     def _clear_one_mac(self, x, t, col, dump):
         self._macs[x] = 0
         if dump and DUMP_MACS_CLEARING:
             print("t=%2d, col=%2d > clear > x=%d" % (t, col, x))
 
-
     def _clear_mac_aux(self, t, col, dump):
         self._mac_aux[0] = 0
         if dump and DUMP_MACS_CLEARING:
             print("t= 0, col=%2d > clear > aux" % (col))
 
+    def _update_one_mac(self, x, t, col, a, b, dump, need_aux=False):
 
-    def _update_one_mac(self, x, a, b):
-
-        if a > 0xFFFF:
-            self._a_seen_17 = True
+        if a > 0x3FFFF:
+            raise Exception("a > 0x3FFFF!")
 
         if b > 0xFFFF:
-            self._b_seen_17 = True
+            raise Exception("b > 0xFFFF!")
 
+        p = a * b
+        if dump and DUMP_MACS_INPUTS:
+            if x == 0: print("t=%2d, col=%2d > b=%05x > " % (t, col, b), end='')
+            if x > 0: print("; ", end='')
+            print("MAC[%d]: a=%05x" % (x, a), end='')
+            if x == (NUM_MULTS-1) and not need_aux: print("")
+            
+        self._macs[x] += p
+
+    def _update_mac_aux(self, y, col, a, b, dump):
+        
         if a > 0x3FFFF:
             raise Exception("a > 0x3FFFF!")
 
-        if b > 0x1FFFF:
-            raise Exception("b > 0x1FFFF!")
+        if b > 0xFFFF:
+            raise Exception("b > 0xFFFF!")
 
         p = a * b
-        self._macs[x] += p
-
-    def _update_mac_aux(self, value):
-        self._mac_aux[0] += value
+        if dump and DUMP_MACS_INPUTS:
+            print("; AUX: a=%05x" % a)
+        self._mac_aux[0] += p
 
     def _preset_indices(self, col):
         for x in range(len(self._indices)):
@@ -440,7 +431,7 @@ class ModExpNG_WordMultiplier():
     def _dump_macs(self, t, col):
         self._dump_macs_helper(t, col)
 
-    def _dump_macs_aux(self, t, col):
+    def _dump_macs_with_aux(self, t, col):
         self._dump_macs_helper(t, col, True)
 
     def _dump_indices_helper(self, t, col, aux=False):
@@ -454,7 +445,7 @@ class ModExpNG_WordMultiplier():
     def _dump_indices(self, t, col):
         self._dump_indices_helper(t, col)
 
-    def _dump_indices_aux(self, t, col):
+    def _dump_indices_with_aux(self, t, col):
         self._dump_indices_helper(t, col, True)
 
     def _rotate_indices(self, num_words):
@@ -473,16 +464,14 @@ class ModExpNG_WordMultiplier():
             print("t=%2d, col=%2d > parts[%2d]: mac[%d] = 0x%012x" %
                 (time, column, part_index, mac_index, parts[part_index]))
 
-    def _mult_store_part_aux(self, parts, time, column, part_index, mac_index, dump):
-        parts[part_index] = self._mac_aux[mac_index]
+    def _mult_store_part_aux(self, parts, time, column, part_index, dump):
+        parts[part_index] = self._mac_aux[0]
         if dump and DUMP_MULT_PARTS:
             print("t=%2d, col=%2d > parts[%2d]: mac_aux[%d] = 0x%012x" %
-                (time, column, part_index, mac_index, parts[part_index]))
+                (time, column, part_index, 0, parts[part_index]))
 
     def multiply_square(self, a_wide, b_narrow, ab_num_words, dump=False):
 
-        if dump: print("multiply_square()")
-
         num_cols = ab_num_words // NUM_MULTS
 
         parts = list()
@@ -490,8 +479,8 @@ class ModExpNG_WordMultiplier():
             parts.append(0)
 
         for col in range(num_cols):
-
-            bt_carry = 0
+        
+            b_carry = 0
         
             for t in range(ab_num_words):
 
@@ -511,15 +500,15 @@ class ModExpNG_WordMultiplier():
                 if dump and DUMP_INDICES: self._dump_indices(t, col)
 
                 # current b-word
-                bt = b_narrow.words[t] + bt_carry
-                bt_carry = bt >> _WORD_WIDTH
-                bt &= 0xffff
-                
+                # TODO: Explain how the 18th bit carry works!!
+                bt = b_narrow.words[t] + b_carry
+                b_carry = (bt & 0x30000) >> 16
+                bt &= 0xFFFF
 
                 # multiply by a-words
                 for x in range(NUM_MULTS):
                     ax = a_wide.words[self._indices[x]]
-                    self._update_one_mac(x, ax, bt)
+                    self._update_one_mac(x, t, col, ax, bt, dump)
 
                     if t == (col * NUM_MULTS + x):
                         part_index = t
@@ -540,8 +529,6 @@ class ModExpNG_WordMultiplier():
 
     def multiply_triangle(self, a_wide, b_narrow, ab_num_words, dump=False):
 
-        if dump: print("multiply_triangle()")
-
         num_cols = ab_num_words // NUM_MULTS
 
         parts = list()
@@ -571,7 +558,7 @@ class ModExpNG_WordMultiplier():
                     if t == 0: self._clear_mac_aux(t, col, dump)
 
                 # debug output
-                if dump and DUMP_INDICES: self._dump_indices_aux(t, col)
+                if dump and DUMP_INDICES: self._dump_indices_with_aux(t, col)
 
                 # current b-word
                 bt = b_narrow.words[t]
@@ -579,7 +566,7 @@ class ModExpNG_WordMultiplier():
                 # multiply by a-words
                 for x in range(NUM_MULTS):
                     ax = a_wide.words[self._indices[x]]
-                    self._update_one_mac(x, ax, bt)
+                    self._update_one_mac(x, t, col, ax, bt, dump, last_col)
 
                     if t == (col * NUM_MULTS + x):
                         part_index = t
@@ -588,14 +575,14 @@ class ModExpNG_WordMultiplier():
                 # aux multiplier
                 if last_col:
                     ax = a_wide.words[self._index_aux[0]]
-                    self._update_mac_aux(ax * bt)
+                    self._update_mac_aux(t, col, ax, bt, dump)
 
                     if t == ab_num_words:
                         part_index = t
-                        self._mult_store_part_aux(parts, t, col, part_index, 0, dump)
+                        self._mult_store_part_aux(parts, t, col, part_index, dump)
 
                 # debug output
-                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs_aux(t, col)
+                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs_with_aux(t, col)
 
                 # shortcut
                 if not last_col:
@@ -605,8 +592,6 @@ class ModExpNG_WordMultiplier():
 
     def multiply_rectangle(self, a_wide, b_narrow, ab_num_words, dump=False):
 
-        if dump: print("multiply_rectangle()")
-
         num_cols = ab_num_words // NUM_MULTS
 
         parts = list()
@@ -638,7 +623,7 @@ class ModExpNG_WordMultiplier():
                 # multiply by a-words
                 for x in range(NUM_MULTS):
                     ax = a_wide.words[self._indices[x]]
-                    self._update_one_mac(x, ax, bt)
+                    self._update_one_mac(x, t, col, ax, bt, dump)
 
                     # don't save one value for the very last time instant per column
                     if t < ab_num_words and t == (col * NUM_MULTS + x):
@@ -686,6 +671,7 @@ class ModExpNG_LowlevelOperator():
         return (sum_c, sum_s)
 
     def sub_words(self, a, b, b_in):
+    
         self._check_word(a)
         self._check_word(b)
         self._check_carry_borrow(b_in)
@@ -704,14 +690,12 @@ class ModExpNG_LowlevelOperator():
 
 class ModExpNG_Worker():
 
-    max_zzz = 0
-
     def __init__(self):
         self.recombinator = ModExpNG_PartRecombinator()
         self.multiplier   = ModExpNG_WordMultiplier()
         self.lowlevel     = ModExpNG_LowlevelOperator()
 
-    def exponentiate(self, iz, bz, e, n, n_factor, n_coeff, num_words):
+    def exponentiate(self, iz, bz, e, n, n_factor, n_coeff, num_words, dump_index=-1, dump_mode=""):
 
         # working variables
         t1, t2 = iz, bz
@@ -719,19 +703,51 @@ class ModExpNG_Worker():
         # length-1, length-2, length-3, ..., 1, 0 (left-to-right)
         for bit in range(_WORD_WIDTH * num_words - 1, -1, -1):
 
-            if e.number() & (1 << bit):
-                p1 = self.multiply(t1, t2, n, n_coeff, num_words)
-                p2 = self.multiply(t2, t2, n, n_coeff, num_words)
+            debug_dump = bit == dump_index
+
+            bit_value = (e.number() & (1 << bit)) >> bit
+            
+            if debug_dump:
+                print("\rladder_mode = %d" % bit_value)
+                
+                if FORCE_OVERFLOW:
+                    T1X = list(t1.words)
+                    for i in range(num_words):
+                        if i > 0:
+                            bits = T1X[i-1] & (3 << 16)
+                            if bits == 0:
+                                bits = T1X[i] & 3
+                                T1X[i] = T1X[i] ^ bits
+                                T1X[i-1] |= (bits << 16)
+                                    
+                    for i in range(num_words):
+                        t1.words[i] = T1X[i]
+                
+                if DUMP_VECTORS:
+                    print("num_words = %d" % num_words)
+                    t1.format_verilog_concat("%s_T1" % dump_mode)
+                    t2.format_verilog_concat("%s_T2" % dump_mode)
+                    n.format_verilog_concat("%s_N" % dump_mode)
+                    n_coeff.format_verilog_concat("%s_N_COEFF"  % dump_mode)
+                            # force the rarely seen overflow
+
+            if bit_value:
+                p1 = self.multiply(t1, t2, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="X")
+                p2 = self.multiply(t2, t2, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="Y")
             else:
-                p1 = self.multiply(t1, t1, n, n_coeff, num_words)
-                p2 = self.multiply(t2, t1, n, n_coeff, num_words)
+                p1 = self.multiply(t1, t1, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="X")
+                p2 = self.multiply(t2, t1, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="Y")
 
             t1, t2 = p1, p2
 
+            if debug_dump and DUMP_VECTORS:
+                t1.format_verilog_concat("%s_X" % dump_mode)
+                t2.format_verilog_concat("%s_Y" % dump_mode)
+
             if (bit % 8) == 0:
                 pct = float((_WORD_WIDTH * num_words - bit) / (_WORD_WIDTH * num_words)) * 100.0
                 print("\rpct: %5.1f%%" % pct, end='')
-
+        
         print("")
 
         return t1
@@ -780,16 +796,11 @@ class ModExpNG_Worker():
 
         return ModExpNG_Operand(None, 2*ab_num_words, ab)
 
-    def multiply(self, a, b, n, n_coeff, ab_num_words, reduce_only=False, multiply_only=False, dump=False):
+    def multiply(self, a, b, n, n_coeff, ab_num_words, reduce_only=False, multiply_only=False, dump=False, dump_mode="", dump_phase=""):
 
-        if dump and DUMP_VECTORS:
-            print("num_words = %d" % ab_num_words)
-            a.format_verilog_concat("A")
-            b.format_verilog_concat("B")
-            n.format_verilog_concat("N")
-            n_coeff.format_verilog_concat("N_COEFF")
-
-        # 1.
+        # 1. AB = A * B
+        if dump: print("multiply_square(%s_%s)" % (dump_mode, dump_phase))
+        
         if reduce_only:
             ab = a
         else:
@@ -797,61 +808,92 @@ class ModExpNG_Worker():
             ab_words = self.recombinator.recombine_square(ab_parts, ab_num_words, dump)
             ab = ModExpNG_Operand(None, 2 * ab_num_words, ab_words)
 
+        if dump and DUMP_VECTORS:
+            ab.format_verilog_concat("%s_%s_AB" % (dump_mode, dump_phase))
+
         if multiply_only:
             return ModExpNG_Operand(None, 2*ab_num_words, ab_words)
 
-        # 2.
+            
+        # 2. Q = LSB(AB) * N_COEFF
+        if dump: print("multiply_triangle(%s_%s)" % (dump_mode, dump_phase))
+        
         q_parts = self.multiplier.multiply_triangle(ab, n_coeff, ab_num_words, dump)
         q_words = self.recombinator.recombine_triangle(q_parts, ab_num_words, dump)
         q = ModExpNG_Operand(None, ab_num_words + 1, q_words)
 
-        # 3.
+        if dump and DUMP_VECTORS:
+            q.format_verilog_concat("%s_%s_Q" % (dump_mode, dump_phase))
+
+        # 3. M = Q * N
+        if dump: print("multiply_rectangle(%s_%s)" % (dump_mode, dump_phase))
+        
         m_parts = self.multiplier.multiply_rectangle(n, q, ab_num_words, dump)
         m_words = self.recombinator.recombine_rectangle(m_parts, ab_num_words, dump)
         m = ModExpNG_Operand(None, 2 * ab_num_words + 1, m_words)
+        
+        if dump and DUMP_VECTORS:
+            m.format_verilog_concat("%s_%s_M" % (dump_mode, dump_phase))
+
+        if (m.number() != (q.number() * n.number())):
+            print("MISMATCH")
+            sys.exit()
 
-        # 4.
-        cy = 0
+            
+        # 4. R = AB + M
+        
+        # 4a. compute carry (actual sum is all zeroes and need not be stored)
+        r_cy = 0 # this can be up to two bits, since we're adding extended words!!
         for i in range(ab_num_words + 1):
-            s = ab.words[i] + m.words[i] + cy
-            cy = s >> 16
+            s = ab.words[i] + m.words[i] + r_cy
+            r_cy_new = s >> 16
             
+            if dump and DUMP_REDUCTION:
+                print("[%2d] 0x%05x + 0x%05x + 0x%x => {0x%x, [0x%05x]}" %
+                    (i, ab.words[i], m.words[i], r_cy, r_cy_new, s & 0xffff))
+                
+            r_cy = r_cy_new
+        
+        
+        # 4b. Initialize empty result
         R = list()
         for i in range(ab_num_words):
             R.append(0)
 
-        R[0] = cy # !!! (cy is 2 bits, i.e. 0..3)
-        
-        if dump:
-            if ab.words[ab_num_words + 2] > 0:
-                ab.words[ab_num_words + 2] -= 1
-                ab.words[ab_num_words + 1] += 0x10000
-            if m.words[ab_num_words + 2] > 0:
-                m.words[ab_num_words + 2] -= 1
-                m.words[ab_num_words + 1] += 0x10000
-        
+        # 4c. compute the actual upper part of sum (take carry into account)
         for i in range(ab_num_words):
+        
+            if dump and DUMP_REDUCTION:
+                print("[%2d]" % i, end='')
+                
             ab_word = ab.words[ab_num_words + i + 1] if i < (ab_num_words - 1) else 0
+            if dump and DUMP_REDUCTION:
+                print(" 0x%05x" % ab_word, end='')
+                
             m_word = m.words[ab_num_words + i + 1]
-            
-            R[i] += ab_word + m_word
-
-            #if i == 0:
-                #if R[i] > self.max_zzz:
-                    #self.max_zzz = R[i]
-                    #print("self.max_zzz = %05x" % R[i])
-                #if R[i] > 0x1ffff:
-                    #sys.exit(123)
+            if dump and DUMP_REDUCTION:
+                print(" + 0x%05x" % m_word, end='')
                 
+            if i == 0: R[i] = r_cy
+            else:      R[i] = 0
             
+            if (r_cy > 3): print("\rR_CY = %d!" % r_cy)
+            
+            if dump and DUMP_REDUCTION:
+                print(" + 0x%x" % R[i], end='')
                 
+            R[i] += ab_word
+            R[i] += m_word
+            if dump and DUMP_REDUCTION:
+                print(" = 0x%05x" % R[i])
+                        
         return ModExpNG_Operand(None, ab_num_words, R)
 
     def reduce(self, a):
         carry = 0
         for x in range(len(a.words)):
             a.words[x] += carry
-            carry = (a.words[x] >> _WORD_WIDTH) & 1
+            carry = (a.words[x] >> _WORD_WIDTH) & 3
             a.words[x] &= self.lowlevel._word_mask
 
 
@@ -894,73 +936,66 @@ if __name__ == "__main__":
     #  s_crt = sq + q_sr_qinv
     # unblind s
     # mutate blinding factors
-    ip_factor                    = worker.multiply(i,                            vector.p_factor,  vector.p, vector.p_coeff, pq_num_words)
-    iq_factor                    = worker.multiply(i,                            vector.q_factor,  vector.q, vector.q_coeff, pq_num_words)
-
-    x_factor                     = worker.multiply(vector.x,                     vector.n_factor,  vector.n, vector.n_coeff, n_num_words)
-    y_factor                     = worker.multiply(vector.y,                     vector.n_factor,  vector.n, vector.n_coeff, n_num_words)
-
-    m_blind                      = worker.multiply(vector.m,                     y_factor,         vector.n, vector.n_coeff, n_num_words)
+    
+    XF  = worker.multiply(vector.x, vector.n_factor, vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
+    YF  = worker.multiply(vector.y, vector.n_factor, vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
 
-    worker.reduce(m_blind)
+    XMF = worker.multiply(XF,       XF,              vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
+    YMF = worker.multiply(YF,       YF,              vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
+    
+    XM  = worker.multiply(i,        XMF,             vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
+    YM  = worker.multiply(i,        YMF,             vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
 
-    mp_blind_inverse_factor      = worker.multiply(m_blind,                      None,             vector.p, vector.p_coeff, pq_num_words, reduce_only=True)
-    mq_blind_inverse_factor      = worker.multiply(m_blind,                      None,             vector.q, vector.q_coeff, pq_num_words, reduce_only=True)
+    MB  = worker.multiply(vector.m, YF,              vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
 
-    mp_blind                     = worker.multiply(mp_blind_inverse_factor,      vector.p_factor,  vector.p, vector.p_coeff, pq_num_words)
-    mq_blind                     = worker.multiply(mq_blind_inverse_factor,      vector.q_factor,  vector.q, vector.q_coeff, pq_num_words)
+    worker.reduce(MB) # just_reduce
 
-    mp_blind_factor              = worker.multiply(mp_blind,                     vector.p_factor,  vector.p, vector.p_coeff, pq_num_words, dump=True)
-    mq_blind_factor              = worker.multiply(mq_blind,                     vector.q_factor,  vector.q, vector.q_coeff, pq_num_words)
+    mp_blind_inverse_factor = worker.multiply(MB, None, vector.p, vector.p_coeff, pq_num_words, reduce_only=True) # mod_reduce (mod p)
+    mq_blind_inverse_factor = worker.multiply(MB, None, vector.q, vector.q_coeff, pq_num_words, reduce_only=True) # mod_reduce (mod q)
 
-    sp_blind_factor              = worker.exponentiate(ip_factor, mp_blind_factor, vector.dp, vector.p, vector.p_factor, vector.p_coeff, pq_num_words)
-    sq_blind_factor              = worker.exponentiate(iq_factor, mq_blind_factor, vector.dq, vector.q, vector.q_factor, vector.q_coeff, pq_num_words)
+    mp_blind = worker.multiply(mp_blind_inverse_factor, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    mq_blind = worker.multiply(mq_blind_inverse_factor, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-    if worker.multiplier._a_seen_17:
-        print("17-bit wide A's seen.")
-    else:
-        print("17-bit wide A's not detected.")
+    mp_blind_factor = worker.multiply(mp_blind, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    mq_blind_factor = worker.multiply(mq_blind, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-    if worker.multiplier._b_seen_17:
-        print("17-bit wide B's seen.")
-    else:
-        print("17-bit wide B's not detected.")
+    ip_factor = worker.multiply(i, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    iq_factor = worker.multiply(i, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-        
+    sp_blind_factor = worker.exponentiate(ip_factor, mp_blind_factor, vector.dp, vector.p, vector.p_factor, vector.p_coeff, pq_num_words, dump_index=99, dump_mode="P") # mod_multiply
+    sq_blind_factor = worker.exponentiate(iq_factor, mq_blind_factor, vector.dq, vector.q, vector.q_factor, vector.q_coeff, pq_num_words, dump_index=99, dump_mode="Q") # mod_multiply
 
-    sp_blind                     = worker.multiply(i,                            sp_blind_factor,  vector.p, vector.p_coeff, pq_num_words)
-    sq_blind                     = worker.multiply(i,                            sq_blind_factor,  vector.q, vector.q_coeff, pq_num_words)
+    SPB = worker.multiply(i, sp_blind_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    SQB = worker.multiply(i, sq_blind_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-    sr_blind                     = worker.subtract(sp_blind, sq_blind, vector.p, pq_num_words)
+    worker.reduce(SPB) # just_reduce
+    worker.reduce(SQB) # just_reduce
 
-    sr_qinv_blind_inverse_factor = worker.multiply(sr_blind,                     vector.qinv,      vector.p, vector.p_coeff, pq_num_words)
-    sr_qinv_blind                = worker.multiply(sr_qinv_blind_inverse_factor, vector.p_factor,  vector.p, vector.p_coeff, pq_num_words)
-    q_sr_qinv_blind              = worker.multiply(vector.q,                     sr_qinv_blind,    None,     None,           pq_num_words, multiply_only=True)
+    sr_blind = worker.subtract(SPB, SQB, vector.p, pq_num_words) # mod_subtract
 
-    worker.reduce(q_sr_qinv_blind)
+    sr_qinv_blind_inverse_factor = worker.multiply(sr_blind, vector.qinv, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    sr_qinv_blind = worker.multiply(sr_qinv_blind_inverse_factor, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
     
-    s_crt_blinded                = worker.add(sq_blind, q_sr_qinv_blind, pq_num_words)
+    q_sr_qinv_blind = worker.multiply(vector.q, sr_qinv_blind, None, None, pq_num_words, multiply_only=True) # just_multiply
 
-    s_crt_unblinded              = worker.multiply(s_crt_blinded,                x_factor,         vector.n, vector.n_coeff, n_num_words)
-
-    x_mutated_factor             = worker.multiply(x_factor,                     x_factor,         vector.n, vector.n_coeff, n_num_words)
-    y_mutated_factor             = worker.multiply(y_factor,                     y_factor,         vector.n, vector.n_coeff, n_num_words)
+    worker.reduce(q_sr_qinv_blind) # just_reduce
+    
+    SB = worker.add(SQB, q_sr_qinv_blind, pq_num_words) # just_add
 
-    x_mutated                    = worker.multiply(i,                            x_mutated_factor, vector.n, vector.n_coeff, n_num_words)
-    y_mutated                    = worker.multiply(i,                            y_mutated_factor, vector.n, vector.n_coeff, n_num_words)
+    S = worker.multiply(SB, XF, vector.n, vector.n_coeff, n_num_words) # mod_multiply
 
-    worker.reduce(s_crt_unblinded)
-    worker.reduce(x_mutated)
-    worker.reduce(y_mutated)
+    worker.reduce(S) # just_reduce
+    worker.reduce(XM) # just_reduce
+    worker.reduce(YM) # just_reduce
 
     # check
-    if s_crt_unblinded.number() != s_known:   print("ERROR: s_crt_unblinded != s_known!")
+    if S.number() != s_known:   print("ERROR: s_crt_unblinded != s_known!")
     else:                                     print("s is OK")
 
-    if x_mutated.number() != x_mutated_known: print("ERROR: x_mutated != x_mutated_known!")
+    if XM.number() != x_mutated_known: print("ERROR: x_mutated != x_mutated_known!")
     else:                                     print("x_mutated is OK")
 
-    if y_mutated.number() != y_mutated_known: print("ERROR: y_mutated != y_mutated_known!")
+    if YM.number() != y_mutated_known: print("ERROR: y_mutated != y_mutated_known!")
     else:                                     print("y_mutated is OK")
 
 


From git at cryptech.is  Mon Aug 19 11:07:11 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:11 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 09/12: * Started
 conversion of the model to use micro-operations
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110710.B7BDB992A52@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit 66be583469258524ada60db7c7d134329f7f4dd1
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Mon Aug 19 13:48:44 2019 +0300

    * Started conversion of the model to use micro-operations
    
    * Added initial operand bank structure (working "wide"/"narrow" pairs plus
      input & output banks). The core has four pairs of working banks (X.X and X.Y
      for Montgomery ladder with modulus P, Y.X and Y.Y for modulus Q)
---
 modexpng_fpga_model.py | 335 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 285 insertions(+), 50 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index cc3e868..4ef6576 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -41,6 +41,7 @@
 
 import sys
 import importlib
+from enum import Enum, auto
 
 
 # --------------
@@ -63,6 +64,7 @@ _KEY_LENGTH_HALF = KEY_LENGTH // 2
 
 # width of internal math pipeline
 _WORD_WIDTH = 16
+_WORD_WIDTH_EXT = 18
 
 # folder with test vector scripts
 _VECTOR_PATH = "/vector"
@@ -122,7 +124,7 @@ class ModExpNG_Operand():
         for i in range(count):
 
             # word must not exceed 18 bits
-            if words[i] >= (2 ** (_WORD_WIDTH + 2)):
+            if words[i] >= (2 ** (_WORD_WIDTH_EXT)):
                 raise Exception("Word is too large!")
 
         self.words = words
@@ -158,7 +160,7 @@ class ModExpNG_Operand():
             ret += word << shift
             shift += _WORD_WIDTH
         return ret
-
+                
 
 #
 # Test Vector
@@ -200,6 +202,118 @@ class ModExpNG_TestVector():
         self.x        = ModExpNG_Operand(vector_inst.x,         KEY_LENGTH)
         self.y        = ModExpNG_Operand(vector_inst.y,         KEY_LENGTH)
 
+class ModExpNG_WideBankEnum(Enum):
+    A   = auto()
+    B   = auto()
+    C   = auto()
+    D   = auto()
+    E   = auto()
+    N   = auto()
+    
+class ModExpNG_NarrowBankEnum(Enum):
+    A       = auto()
+    B       = auto()
+    C       = auto()
+    D       = auto()
+    E       = auto()
+    N_COEFF = auto()
+    I       = auto()
+        
+class ModExpNG_WideBank():
+
+    def __init__(self):
+        self.a   = None
+        self.b   = None
+        self.c   = None
+        self.d   = None
+        self.e   = None
+        self.n   = None
+    
+    def _get_value(self, sel):
+        if   sel == ModExpNG_WideBankEnum.A:   return self.a
+        elif sel == ModExpNG_WideBankEnum.B:   return self.b
+        elif sel == ModExpNG_WideBankEnum.C:   return self.c
+        elif sel == ModExpNG_WideBankEnum.D:   return self.d
+        elif sel == ModExpNG_WideBankEnum.E:   return self.e
+        elif sel == ModExpNG_WideBankEnum.N:   return self.n
+        else: raise Exception("ModExpNG_WideBank._get_value(): Invalid selector!")
+
+    def _set_value(self, sel, value):
+        if   sel == ModExpNG_WideBankEnum.A:   self.a   = value
+        elif sel == ModExpNG_WideBankEnum.B:   self.b   = value
+        elif sel == ModExpNG_WideBankEnum.C:   self.c   = value
+        elif sel == ModExpNG_WideBankEnum.D:   self.d   = value
+        elif sel == ModExpNG_WideBankEnum.E:   self.e   = value
+        elif sel == ModExpNG_WideBankEnum.N:   self.n   = value
+        else: raise Exception("ModExpNG_WideBank._set_value(): Invalid selector!")
+
+class ModExpNG_NarrowBank():
+
+    def __init__(self, i):
+        self.a       = None
+        self.b       = None
+        self.c       = None
+        self.d       = None
+        self.e       = None
+        self.n_coeff = None
+        self.i       = i
+        
+    def _get_value(self, sel):
+        if   sel == ModExpNG_NarrowBankEnum.A:       return self.a
+        elif sel == ModExpNG_NarrowBankEnum.B:       return self.b
+        elif sel == ModExpNG_NarrowBankEnum.C:       return self.c
+        elif sel == ModExpNG_NarrowBankEnum.D:       return self.d
+        elif sel == ModExpNG_NarrowBankEnum.E:       return self.e
+        elif sel == ModExpNG_NarrowBankEnum.N_COEFF: return self.n_coeff
+        elif sel == ModExpNG_NarrowBankEnum.I:       return self.i
+        else: raise Exception("ModExpNG_NarrowBank._get_value(): Invalid selector!")
+
+    def _set_value(self, sel, value):
+        if   sel == ModExpNG_NarrowBankEnum.A:       self.a       = value
+        elif sel == ModExpNG_NarrowBankEnum.B:       self.b       = value
+        elif sel == ModExpNG_NarrowBankEnum.C:       self.c       = value
+        elif sel == ModExpNG_NarrowBankEnum.D:       self.d       = value
+        elif sel == ModExpNG_NarrowBankEnum.E:       self.e       = value
+        elif sel == ModExpNG_NarrowBankEnum.N_COEFF: self.n_coeff = value
+        else: raise Exception("ModExpNG_NarrowBank._set_value(): Invalid selector!")
+
+class ModExpNG_BanksPair():
+    
+    def __init__(self, i):
+        self.wide = ModExpNG_WideBank()
+        self.narrow = ModExpNG_NarrowBank(i)
+        
+    def _get_value_wide(self, sel):
+        return self.wide._get_value(sel)
+
+    def _get_value_narrow(self, sel):
+        return self.narrow._get_value(sel)
+
+class ModExpNG_BanksLadder():
+
+    def __init__(self, i):
+        self.ladder_x = ModExpNG_BanksPair(i)
+        self.ladder_y = ModExpNG_BanksPair(i)
+        
+    def set_modulus(self, n, n_coeff):
+        self.ladder_x.wide._set_value(ModExpNG_WideBankEnum.N, n)
+        self.ladder_y.wide._set_value(ModExpNG_WideBankEnum.N, n)
+        self.ladder_x.narrow._set_value(ModExpNG_NarrowBankEnum.N_COEFF, n_coeff)
+        self.ladder_y.narrow._set_value(ModExpNG_NarrowBankEnum.N_COEFF, n_coeff)
+        
+    def set_operand(self, sel_wide, sel_narrow, x, y):
+        if sel_wide is not None:
+            self.ladder_x.wide._set_value(sel_wide, x)
+            self.ladder_y.wide._set_value(sel_wide, y)
+        if sel_narrow is not None:
+            self.ladder_x.narrow._set_value(sel_narrow, x)
+            self.ladder_y.narrow._set_value(sel_narrow, y)
+
+class ModExpNG_BanksCRT():
+
+    def __init__(self, i):
+        self.crt_x = ModExpNG_BanksLadder(i)
+        self.crt_y = ModExpNG_BanksLadder(i)
 
 class ModExpNG_PartRecombinator():
 
@@ -348,7 +462,6 @@ class ModExpNG_PartRecombinator():
 
         return words
 
-
 class ModExpNG_WordMultiplier():
 
     def __init__(self):
@@ -641,7 +754,6 @@ class ModExpNG_WordMultiplier():
 
         return parts
 
-
 class ModExpNG_LowlevelOperator():
 
     def __init__(self):
@@ -687,7 +799,6 @@ class ModExpNG_LowlevelOperator():
 
         return (dif_b, dif_d)
 
-
 class ModExpNG_Worker():
 
     def __init__(self):
@@ -888,15 +999,109 @@ class ModExpNG_Worker():
                 print(" = 0x%05x" % R[i])
                         
         return ModExpNG_Operand(None, ab_num_words, R)
-
-    def reduce(self, a):
+    
+    def reduce(self, a, num_words):
         carry = 0
-        for x in range(len(a.words)):
+        for x in range(num_words):
             a.words[x] += carry
             carry = (a.words[x] >> _WORD_WIDTH) & 3
             a.words[x] &= self.lowlevel._word_mask
 
+class ModExpNG_CoreOutputEnum(Enum):
+    XM = auto()
+    YM = auto()
+    S  = auto()
+            
+class ModExpNG_CoreOutput():
+    
+    def __init__(self):
+        self._xm = None
+        self._ym = None
+        self._s  = None
+        
+    def _set_value(self, sel, value):
+        if   sel == ModExpNG_CoreOutputEnum.XM: self._xm = value
+        elif sel == ModExpNG_CoreOutputEnum.YM: self._ym = value
+        elif sel == ModExpNG_CoreOutputEnum.S:  self._s  = value
+        else: raise Exception("ModExpNG_CoreOutput._set_value(): invalid selector!")
+        
+    def get_value(self, sel):
+        if   sel == ModExpNG_CoreOutputEnum.XM: return self._xm
+        elif sel == ModExpNG_CoreOutputEnum.YM: return self._ym
+        elif sel == ModExpNG_CoreOutputEnum.S:  return self._s
+        else: raise Exception("ModExpNG_CoreOutput.get_value(): invalid selector!")
+            
+class ModExpNG_Core():
+    
+    def __init__(self, i):
+        self.wrk = ModExpNG_Worker()
+        self.bnk = ModExpNG_BanksCRT(i)
+        self.out = ModExpNG_CoreOutput()
+        
+    def multiply(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words, mode=(True, True)):
+        
+        xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        
+        xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        
+        xxa       = self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in)
+        xya       = self.bnk.crt_x.ladder_y.wide._get_value(sel_wide_in)
+
+        yxa       = self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in)
+        yya       = self.bnk.crt_y.ladder_y.wide._get_value(sel_wide_in)
+        
+        xxb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
+        xyb       = self.bnk.crt_x.ladder_y.narrow._get_value(sel_narrow_in)
+
+        yxb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
+        yyb       = self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow_in)
+        
+        if not mode[0]: xb = xxb
+        else:           xb = xyb
+
+        if not mode[1]: yb = yxb
+        else:           yb = yyb
+
+        xxp = self.wrk.multiply(xxa, xb, xn, xn_coeff, num_words)
+        xyp = self.wrk.multiply(xya, xb, xn, xn_coeff, num_words)
+
+        yxp = self.wrk.multiply(yxa, yb, yn, yn_coeff, num_words)
+        yyp = self.wrk.multiply(yya, yb, yn, yn_coeff, num_words)
+        
+        if sel_wide_out is not None:
+            self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xxp)
+            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, xyp)
+            self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yxp)
+            self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, yyp)
+        
+        if sel_narrow_out is not None:
+            self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xxp)
+            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, xyp)
+            self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yxp)
+            self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, yyp)
+
+    def simply_reduce(self, sel_narrow, num_words):
+        self.wrk.reduce(self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow), num_words)
+        self.wrk.reduce(self.bnk.crt_x.ladder_y.narrow._get_value(sel_narrow), num_words)
+        self.wrk.reduce(self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow), num_words)
+        self.wrk.reduce(self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow), num_words)
+        
+    def set_output(self, sel_output, banks_ladder, sel_narrow):
+        self.out._set_value(sel_output, banks_ladder.ladder_x.narrow._get_value(sel_narrow))
+    
+    def mirror_yx(self, sel_wide, sel_narrow):
+    
+        if sel_wide is not None:
+            self.bnk.crt_x.ladder_x.wide._set_value(sel_wide, self.bnk.crt_y.ladder_x.wide._get_value(sel_wide))
+            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide, self.bnk.crt_y.ladder_y.wide._get_value(sel_wide))
 
+        if sel_narrow is not None:
+            self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow, self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow))
+            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow, self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow))
+        
+        
 if __name__ == "__main__":
 
     # load test vector
@@ -906,16 +1111,16 @@ if __name__ == "__main__":
     # create helper quantity
     # mutate blinding quantities with built-in math
 
-    vector = ModExpNG_TestVector()
-    worker = ModExpNG_Worker()
-
     n_num_words  = KEY_LENGTH  // _WORD_WIDTH
     pq_num_words = n_num_words // 2
 
-    s_known  = pow(vector.m.number(), vector.d.number(), vector.n.number())
-
     i = ModExpNG_Operand(1, KEY_LENGTH)
 
+    vector = ModExpNG_TestVector()
+    core   = ModExpNG_Core(i)
+    
+    s_known = pow(vector.m.number(), vector.d.number(), vector.n.number())
+
     x_mutated_known = pow(vector.x.number(), 2, vector.n.number())
     y_mutated_known = pow(vector.y.number(), 2, vector.n.number())
 
@@ -936,67 +1141,97 @@ if __name__ == "__main__":
     #  s_crt = sq + q_sr_qinv
     # unblind s
     # mutate blinding factors
-    
-    XF  = worker.multiply(vector.x, vector.n_factor, vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
-    YF  = worker.multiply(vector.y, vector.n_factor, vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
 
-    XMF = worker.multiply(XF,       XF,              vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
-    YMF = worker.multiply(YF,       YF,              vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
+    W = ModExpNG_WideBankEnum
+    N = ModExpNG_NarrowBankEnum
+    O = ModExpNG_CoreOutputEnum
     
-    XM  = worker.multiply(i,        XMF,             vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
-    YM  = worker.multiply(i,        YMF,             vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
+    core.bnk.crt_x.set_modulus(vector.n, vector.n_coeff)
+    core.bnk.crt_y.set_modulus(vector.n, vector.n_coeff)
+    
+    core.bnk.crt_x.set_operand(W.A, N.A, vector.x, vector.n_factor)
+    core.bnk.crt_y.set_operand(W.A, N.A, vector.y, vector.n_factor)
+
+    core.bnk.crt_x.set_operand(W.E, N.E, vector.m, vector.m)
+    core.bnk.crt_y.set_operand(W.E, N.E, vector.m, vector.m)
+
+    #   | W   | N
+    # --+-----+-----------
+    # A | 
+    # B | ?   | ?
+    # C | ?   | ?
+    # D | ?   | ?
+    # E | M   | M
+
+                                                                        #                          | A              | B     | C       | D     | E |
+                                                                        #                          +----------------+-------+---------+-------+---+
+                                                                        # (YF, XF) =(Y,X)*N_FACTOR | X,Y ; N_FACTOR | ?     | ?       | ?     | M |
+    core.multiply(W.A, N.A, W.B, N.B, n_num_words)                      # (YF, XF) =(Y,X)*N_FACTOR | X,Y ; N_FACTOR | XF,YF | ?       | ?     | M |
+    core.multiply(W.B, N.B, W.C, N.C, n_num_words, mode=(False, False)) # (YMF,XMF)=(YF*YF,XF*XF)  | X,Y ; N_FACTOR | XF,YF | YMF,XMF | ?     | M |
+    core.multiply(W.C, N.I, W.D, N.D, n_num_words)                      # (YM, XM) =(YMF,XMF)*1    | X,Y ; N_FACTOR | XF,YF | YMF,XMF | XM,YM | M |
+    core.simply_reduce(N.D, n_num_words)                                #                          |                |       |         |       |   |
+    core.set_output(O.XM, core.bnk.crt_x, N.D)                          #                          |                |       |         |       |   |
+    core.set_output(O.YM, core.bnk.crt_y, N.D)                          #                          |                |       |         |       |   |
+    core.multiply(W.E, N.B, W.C, N.C, n_num_words, mode=(False, False)) # (MB, _)  =(M*YF,M*XF)    | X,Y ; N_FACTOR | XF,YF | MB,_    | XM,YM | M |
+    core.mirror_yx(W.C, N.C)                                            #                          | X,Y ; N_FACTOR | XF,YF | MB,MB   | XM,YM | M |
+    core.simply_reduce(N.C, n_num_words)                                #                          |                |       |         |       |   |
+
+
+    XF = core.bnk.crt_x.ladder_x.wide._get_value(W.B)
+    YF = core.bnk.crt_y.ladder_x.wide._get_value(W.B)
+        
+    MB = core.bnk.crt_y.ladder_x.narrow._get_value(N.C)
 
-    MB  = worker.multiply(vector.m, YF,              vector.n, vector.n_coeff, n_num_words) # mod_multiply (mod n)
+    PMBZ = core.wrk.multiply(MB, None, vector.p, vector.p_coeff, pq_num_words, reduce_only=True) # mod_reduce (mod p)
+    QMBZ = core.wrk.multiply(MB, None, vector.q, vector.q_coeff, pq_num_words, reduce_only=True) # mod_reduce (mod q)
 
-    worker.reduce(MB) # just_reduce
+    mp_blind = core.wrk.multiply(PMBZ, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    mq_blind = core.wrk.multiply(QMBZ, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-    mp_blind_inverse_factor = worker.multiply(MB, None, vector.p, vector.p_coeff, pq_num_words, reduce_only=True) # mod_reduce (mod p)
-    mq_blind_inverse_factor = worker.multiply(MB, None, vector.q, vector.q_coeff, pq_num_words, reduce_only=True) # mod_reduce (mod q)
+    mp_blind_factor = core.wrk.multiply(mp_blind, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    mq_blind_factor = core.wrk.multiply(mq_blind, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-    mp_blind = worker.multiply(mp_blind_inverse_factor, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
-    mq_blind = worker.multiply(mq_blind_inverse_factor, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
+    ip_factor = core.wrk.multiply(i, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    iq_factor = core.wrk.multiply(i, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-    mp_blind_factor = worker.multiply(mp_blind, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
-    mq_blind_factor = worker.multiply(mq_blind, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-    ip_factor = worker.multiply(i, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
-    iq_factor = worker.multiply(i, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-    sp_blind_factor = worker.exponentiate(ip_factor, mp_blind_factor, vector.dp, vector.p, vector.p_factor, vector.p_coeff, pq_num_words, dump_index=99, dump_mode="P") # mod_multiply
-    sq_blind_factor = worker.exponentiate(iq_factor, mq_blind_factor, vector.dq, vector.q, vector.q_factor, vector.q_coeff, pq_num_words, dump_index=99, dump_mode="Q") # mod_multiply
+    sp_blind_factor = core.wrk.exponentiate(ip_factor, mp_blind_factor, vector.dp, vector.p, vector.p_factor, vector.p_coeff, pq_num_words, dump_index=99, dump_mode="P") # mod_multiply
+    sq_blind_factor = core.wrk.exponentiate(iq_factor, mq_blind_factor, vector.dq, vector.q, vector.q_factor, vector.q_coeff, pq_num_words, dump_index=99, dump_mode="Q") # mod_multiply
 
-    SPB = worker.multiply(i, sp_blind_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
-    SQB = worker.multiply(i, sq_blind_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
+    SPB = core.wrk.multiply(i, sp_blind_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    SQB = core.wrk.multiply(i, sq_blind_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-    worker.reduce(SPB) # just_reduce
-    worker.reduce(SQB) # just_reduce
+    core.wrk.reduce(SPB, len(SPB.words)) # just_reduce
+    core.wrk.reduce(SQB, len(SQB.words)) # just_reduce
 
-    sr_blind = worker.subtract(SPB, SQB, vector.p, pq_num_words) # mod_subtract
+    sr_blind = core.wrk.subtract(SPB, SQB, vector.p, pq_num_words) # mod_subtract
 
-    sr_qinv_blind_inverse_factor = worker.multiply(sr_blind, vector.qinv, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
-    sr_qinv_blind = worker.multiply(sr_qinv_blind_inverse_factor, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    sr_qinv_blind_inverse_factor = core.wrk.multiply(sr_blind, vector.qinv, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    sr_qinv_blind = core.wrk.multiply(sr_qinv_blind_inverse_factor, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
     
-    q_sr_qinv_blind = worker.multiply(vector.q, sr_qinv_blind, None, None, pq_num_words, multiply_only=True) # just_multiply
+    q_sr_qinv_blind = core.wrk.multiply(vector.q, sr_qinv_blind, None, None, pq_num_words, multiply_only=True) # just_multiply
 
-    worker.reduce(q_sr_qinv_blind) # just_reduce
+    core.wrk.reduce(q_sr_qinv_blind, n_num_words) # just_reduce
     
-    SB = worker.add(SQB, q_sr_qinv_blind, pq_num_words) # just_add
+    SB = core.wrk.add(SQB, q_sr_qinv_blind, pq_num_words) # just_add
 
-    S = worker.multiply(SB, XF, vector.n, vector.n_coeff, n_num_words) # mod_multiply
+    S = core.wrk.multiply(SB, XF, vector.n, vector.n_coeff, n_num_words) # mod_multiply
 
-    worker.reduce(S) # just_reduce
-    worker.reduce(XM) # just_reduce
-    worker.reduce(YM) # just_reduce
+    core.wrk.reduce(S, len(S.words)) # just_reduce
 
     # check
-    if S.number() != s_known:   print("ERROR: s_crt_unblinded != s_known!")
-    else:                                     print("s is OK")
+    XM = core.out.get_value(O.XM)
+    YM = core.out.get_value(O.YM)
+    
+    if S.number() != s_known: print("ERROR: s_crt_unblinded != s_known!")
+    else:                     print("s is OK")
 
     if XM.number() != x_mutated_known: print("ERROR: x_mutated != x_mutated_known!")
-    else:                                     print("x_mutated is OK")
+    else:                              print("x_mutated is OK")
 
     if YM.number() != y_mutated_known: print("ERROR: y_mutated != y_mutated_known!")
-    else:                                     print("y_mutated is OK")
+    else:                              print("y_mutated is OK")
 
 
 #


From git at cryptech.is  Mon Aug 19 11:07:12 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:12 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 10/12: * Added more
 micro-operations
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110711.465FA992A55@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit a5200cd8af68cca49978bfa023b704ad5431eae1
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Mon Aug 19 13:57:12 2019 +0300

    * Added more micro-operations
    
    * Working microcode for CRT exponentiation
    
    * Further refactoring
---
 modexpng_fpga_model.py | 372 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 295 insertions(+), 77 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index 4ef6576..f57c7b9 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -160,7 +160,18 @@ class ModExpNG_Operand():
             ret += word << shift
             shift += _WORD_WIDTH
         return ret
-                
+        
+    def _get_half(self, part):
+        num_words = len(self.words)
+        num_words_half = num_words // 2
+        if not part: return ModExpNG_Operand(None, num_words_half, self.words[:num_words_half])
+        else:        return ModExpNG_Operand(None, num_words_half, self.words[num_words_half:])
+        
+    def lower_half(self):
+        return self._get_half(False)
+        
+    def upper_half(self):
+        return self._get_half(True)
 
 #
 # Test Vector
@@ -209,6 +220,8 @@ class ModExpNG_WideBankEnum(Enum):
     D   = auto()
     E   = auto()
     N   = auto()
+    L   = auto()
+    H   = auto()
     
 class ModExpNG_NarrowBankEnum(Enum):
     A       = auto()
@@ -222,12 +235,14 @@ class ModExpNG_NarrowBankEnum(Enum):
 class ModExpNG_WideBank():
 
     def __init__(self):
-        self.a   = None
-        self.b   = None
-        self.c   = None
-        self.d   = None
-        self.e   = None
-        self.n   = None
+        self.a = None
+        self.b = None
+        self.c = None
+        self.d = None
+        self.e = None
+        self.n = None
+        self.l = None
+        self.h = None
     
     def _get_value(self, sel):
         if   sel == ModExpNG_WideBankEnum.A:   return self.a
@@ -236,6 +251,8 @@ class ModExpNG_WideBank():
         elif sel == ModExpNG_WideBankEnum.D:   return self.d
         elif sel == ModExpNG_WideBankEnum.E:   return self.e
         elif sel == ModExpNG_WideBankEnum.N:   return self.n
+        elif sel == ModExpNG_WideBankEnum.L:   return self.l
+        elif sel == ModExpNG_WideBankEnum.H:   return self.h
         else: raise Exception("ModExpNG_WideBank._get_value(): Invalid selector!")
 
     def _set_value(self, sel, value):
@@ -245,6 +262,8 @@ class ModExpNG_WideBank():
         elif sel == ModExpNG_WideBankEnum.D:   self.d   = value
         elif sel == ModExpNG_WideBankEnum.E:   self.e   = value
         elif sel == ModExpNG_WideBankEnum.N:   self.n   = value
+        elif sel == ModExpNG_WideBankEnum.L:   self.l   = value
+        elif sel == ModExpNG_WideBankEnum.H:   self.h   = value
         else: raise Exception("ModExpNG_WideBank._set_value(): Invalid selector!")
 
 class ModExpNG_NarrowBank():
@@ -283,11 +302,17 @@ class ModExpNG_BanksPair():
         self.wide = ModExpNG_WideBank()
         self.narrow = ModExpNG_NarrowBank(i)
         
-    def _get_value_wide(self, sel):
+    def _get_wide(self, sel):
         return self.wide._get_value(sel)
 
-    def _get_value_narrow(self, sel):
+    def _get_narrow(self, sel):
         return self.narrow._get_value(sel)
+        
+    def _set_wide(self, sel, value):
+        self.wide._set_value(sel, value)
+        
+    def _set_narrow(self, sel, value):
+        self.narrow._set_value(sel, value)
 
 class ModExpNG_BanksLadder():
 
@@ -301,12 +326,12 @@ class ModExpNG_BanksLadder():
         self.ladder_x.narrow._set_value(ModExpNG_NarrowBankEnum.N_COEFF, n_coeff)
         self.ladder_y.narrow._set_value(ModExpNG_NarrowBankEnum.N_COEFF, n_coeff)
         
-    def set_operand(self, sel_wide, sel_narrow, x, y):
+    def set_operands_crt_xy(self, sel_wide, sel_narrow, x, y):
         if sel_wide is not None:
             self.ladder_x.wide._set_value(sel_wide, x)
-            self.ladder_y.wide._set_value(sel_wide, y)
+            self.ladder_y.wide._set_value(sel_wide, x)
         if sel_narrow is not None:
-            self.ladder_x.narrow._set_value(sel_narrow, x)
+            self.ladder_x.narrow._set_value(sel_narrow, y)
             self.ladder_y.narrow._set_value(sel_narrow, y)
 
 class ModExpNG_BanksCRT():
@@ -814,9 +839,20 @@ class ModExpNG_Worker():
         # length-1, length-2, length-3, ..., 1, 0 (left-to-right)
         for bit in range(_WORD_WIDTH * num_words - 1, -1, -1):
 
+            bit_value = (e.number() & (1 << bit)) >> bit
+
+            if bit > 500:
+                print("%s: bit=#%d (%d)" % (dump_mode, bit, bit_value))
+                print("")
+                print("%s_T1_BEFORE: %s" % (dump_mode, hex(t1.number())))
+                print("%s_T2_BEFORE: %s" % (dump_mode, hex(t2.number())))
+                print("")
+            else:
+                return None
+
+
             debug_dump = bit == dump_index
 
-            bit_value = (e.number() & (1 << bit)) >> bit
             
             if debug_dump:
                 print("\rladder_mode = %d" % bit_value)
@@ -855,7 +891,7 @@ class ModExpNG_Worker():
                 t1.format_verilog_concat("%s_X" % dump_mode)
                 t2.format_verilog_concat("%s_Y" % dump_mode)
 
-            if (bit % 8) == 0:
+            if (bit % 16) == 0:
                 pct = float((_WORD_WIDTH * num_words - bit) / (_WORD_WIDTH * num_words)) * 100.0
                 print("\rpct: %5.1f%%" % pct, end='')
         
@@ -913,7 +949,7 @@ class ModExpNG_Worker():
         if dump: print("multiply_square(%s_%s)" % (dump_mode, dump_phase))
         
         if reduce_only:
-            ab = a
+            ab = b
         else:
             ab_parts = self.multiplier.multiply_square(a, b, ab_num_words, dump)
             ab_words = self.recombinator.recombine_square(ab_parts, ab_num_words, dump)
@@ -1000,12 +1036,13 @@ class ModExpNG_Worker():
                         
         return ModExpNG_Operand(None, ab_num_words, R)
     
-    def reduce(self, a, num_words):
-        carry = 0
+    def reduce(self, a, num_words, carry_in=0):
+        carry = carry_in
         for x in range(num_words):
             a.words[x] += carry
             carry = (a.words[x] >> _WORD_WIDTH) & 3
             a.words[x] &= self.lowlevel._word_mask
+        return carry
 
 class ModExpNG_CoreOutputEnum(Enum):
     XM = auto()
@@ -1038,7 +1075,7 @@ class ModExpNG_Core():
         self.bnk = ModExpNG_BanksCRT(i)
         self.out = ModExpNG_CoreOutput()
         
-    def multiply(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words, mode=(True, True)):
+    def modular_multiply(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words, mode=(True, True)):
         
         xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
         yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
@@ -1082,16 +1119,82 @@ class ModExpNG_Core():
             self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yxp)
             self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, yyp)
 
-    def simply_reduce(self, sel_narrow, num_words):
+    def modular_subtract(self, sel_narrow_in, sel_narrow_out, sel_wide_out, num_words):
+
+        xa = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
+        xb = self.bnk.crt_x.ladder_y.narrow._get_value(sel_narrow_in)
+        xn = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+
+        ya = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
+        yb = self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow_in)
+        yn = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        
+        xd = self.wrk.subtract(xa, xb, xn, num_words)
+        yd = self.wrk.subtract(ya, yb, yn, num_words)
+        
+        self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xd)
+        self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yd)
+
+        self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xd)
+        self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yd)
+        
+    def reduce_narrow(self, sel_narrow, num_words):
         self.wrk.reduce(self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow), num_words)
         self.wrk.reduce(self.bnk.crt_x.ladder_y.narrow._get_value(sel_narrow), num_words)
         self.wrk.reduce(self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow), num_words)
         self.wrk.reduce(self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow), num_words)
+
+    def merge_lha(self, sel_narrow, num_words):
+        xx_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
+        xy_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
+        yx_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
+        yy_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
+        
+        xx_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
+        xy_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
+        yx_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
+        yy_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
+        
+        xx = xx_lsb.words + xx_msb.words
+        xy = xy_lsb.words + xy_msb.words
+        yx = yx_lsb.words + yx_msb.words
+        yy = yy_lsb.words + yy_msb.words
+        
+        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, xx))
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, xy))
+        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, yx))
+        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, yy))
+
+    def modular_reduce(self, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words):
+    
+        xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        
+        xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        
+        xb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
+        yb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
         
+        xp = self.wrk.multiply(None, xb, xn, xn_coeff, num_words, reduce_only=True)
+        yp = self.wrk.multiply(None, yb, yn, yn_coeff, num_words, reduce_only=True)
+        
+        if sel_wide_out is not None:
+            self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xp)
+            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, xp)
+            self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yp)
+            self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, yp)
+        
+        if sel_narrow_out is not None:
+            self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xp)
+            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, xp)
+            self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yp)
+            self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, yp)
+
     def set_output(self, sel_output, banks_ladder, sel_narrow):
         self.out._set_value(sel_output, banks_ladder.ladder_x.narrow._get_value(sel_narrow))
     
-    def mirror_yx(self, sel_wide, sel_narrow):
+    def move_crt_y2x(self, sel_wide, sel_narrow):
     
         if sel_wide is not None:
             self.bnk.crt_x.ladder_x.wide._set_value(sel_wide, self.bnk.crt_y.ladder_x.wide._get_value(sel_wide))
@@ -1100,6 +1203,90 @@ class ModExpNG_Core():
         if sel_narrow is not None:
             self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow, self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow))
             self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow, self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow))
+
+    def move_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
+        
+        if sel_wide_out is not None:
+            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in))
+            self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in))
+
+        if sel_narrow_out is not None:
+            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in))
+            self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in))
+
+    def flip_ladder_y2x(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
+        
+        if sel_wide_out is not None:
+            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in))
+
+        if sel_narrow_out is not None:
+            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in))
+
+    def just_multiply(self, sel_wide_in, sel_narrow_in, num_words):
+
+        xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        
+        xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        
+        xxa       = self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in)
+        xya       = self.bnk.crt_x.ladder_y.wide._get_value(sel_wide_in)
+
+        yxa       = self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in)
+        yya       = self.bnk.crt_y.ladder_y.wide._get_value(sel_wide_in)
+        
+        xb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
+        yb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
+        
+        xxp = self.wrk.multiply(xxa, xb, None, None, num_words, multiply_only=True)
+        xyp = self.wrk.multiply(xya, xb, None, None, num_words, multiply_only=True)
+
+        yxp = self.wrk.multiply(yxa, yb, None, None, num_words, multiply_only=True)
+        yyp = self.wrk.multiply(yya, yb, None, None, num_words, multiply_only=True)
+        
+        xxp_lsb = xxp.lower_half()
+        xxp_msb = xxp.upper_half()
+
+        xyp_lsb = xyp.lower_half()
+        xyp_msb = xyp.upper_half()
+
+        yxp_lsb = yxp.lower_half()
+        yxp_msb = yxp.upper_half()
+
+        yyp_lsb = yyp.lower_half()
+        yyp_msb = yyp.upper_half()
+                
+        self.bnk.crt_x.ladder_x.wide._set_value(ModExpNG_WideBankEnum.L, xxp_lsb)
+        self.bnk.crt_x.ladder_y.wide._set_value(ModExpNG_WideBankEnum.L, xyp_lsb)
+        self.bnk.crt_y.ladder_x.wide._set_value(ModExpNG_WideBankEnum.L, yxp_lsb)
+        self.bnk.crt_y.ladder_y.wide._set_value(ModExpNG_WideBankEnum.L, yyp_lsb)
+
+        self.bnk.crt_x.ladder_x.wide._set_value(ModExpNG_WideBankEnum.H, xxp_msb)
+        self.bnk.crt_x.ladder_y.wide._set_value(ModExpNG_WideBankEnum.H, xyp_msb)
+        self.bnk.crt_y.ladder_x.wide._set_value(ModExpNG_WideBankEnum.H, yxp_msb)
+        self.bnk.crt_y.ladder_y.wide._set_value(ModExpNG_WideBankEnum.H, yyp_msb)
+    
+    def just_add(self, sel_narrow_a_in, sel_narrow_b_in, sel_narrow_out, num_words):
+        xxa = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_a_in)
+        xya = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_a_in)
+        yxa = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_a_in)
+        yya = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_a_in)
+
+        xxb = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_b_in)
+        xyb = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_b_in)
+        yxb = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_b_in)
+        yyb = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_b_in)
+        
+        xxc = self.wrk.add(xxa, xxb, num_words)
+        xyc = self.wrk.add(xya, xyb, num_words)
+        yxc = self.wrk.add(yxa, yxb, num_words)
+        yyc = self.wrk.add(yya, yyb, num_words)
+        
+        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xxc)
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xyc)
+        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yxc)
+        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yyc)
         
         
 if __name__ == "__main__":
@@ -1149,78 +1336,109 @@ if __name__ == "__main__":
     core.bnk.crt_x.set_modulus(vector.n, vector.n_coeff)
     core.bnk.crt_y.set_modulus(vector.n, vector.n_coeff)
     
-    core.bnk.crt_x.set_operand(W.A, N.A, vector.x, vector.n_factor)
-    core.bnk.crt_y.set_operand(W.A, N.A, vector.y, vector.n_factor)
-
-    core.bnk.crt_x.set_operand(W.E, N.E, vector.m, vector.m)
-    core.bnk.crt_y.set_operand(W.E, N.E, vector.m, vector.m)
-
-    #   | W   | N
-    # --+-----+-----------
-    # A | 
-    # B | ?   | ?
-    # C | ?   | ?
-    # D | ?   | ?
-    # E | M   | M
-
-                                                                        #                          | A              | B     | C       | D     | E |
-                                                                        #                          +----------------+-------+---------+-------+---+
-                                                                        # (YF, XF) =(Y,X)*N_FACTOR | X,Y ; N_FACTOR | ?     | ?       | ?     | M |
-    core.multiply(W.A, N.A, W.B, N.B, n_num_words)                      # (YF, XF) =(Y,X)*N_FACTOR | X,Y ; N_FACTOR | XF,YF | ?       | ?     | M |
-    core.multiply(W.B, N.B, W.C, N.C, n_num_words, mode=(False, False)) # (YMF,XMF)=(YF*YF,XF*XF)  | X,Y ; N_FACTOR | XF,YF | YMF,XMF | ?     | M |
-    core.multiply(W.C, N.I, W.D, N.D, n_num_words)                      # (YM, XM) =(YMF,XMF)*1    | X,Y ; N_FACTOR | XF,YF | YMF,XMF | XM,YM | M |
-    core.simply_reduce(N.D, n_num_words)                                #                          |                |       |         |       |   |
-    core.set_output(O.XM, core.bnk.crt_x, N.D)                          #                          |                |       |         |       |   |
-    core.set_output(O.YM, core.bnk.crt_y, N.D)                          #                          |                |       |         |       |   |
-    core.multiply(W.E, N.B, W.C, N.C, n_num_words, mode=(False, False)) # (MB, _)  =(M*YF,M*XF)    | X,Y ; N_FACTOR | XF,YF | MB,_    | XM,YM | M |
-    core.mirror_yx(W.C, N.C)                                            #                          | X,Y ; N_FACTOR | XF,YF | MB,MB   | XM,YM | M |
-    core.simply_reduce(N.C, n_num_words)                                #                          |                |       |         |       |   |
-
-
-    XF = core.bnk.crt_x.ladder_x.wide._get_value(W.B)
-    YF = core.bnk.crt_y.ladder_x.wide._get_value(W.B)
-        
-    MB = core.bnk.crt_y.ladder_x.narrow._get_value(N.C)
-
-    PMBZ = core.wrk.multiply(MB, None, vector.p, vector.p_coeff, pq_num_words, reduce_only=True) # mod_reduce (mod p)
-    QMBZ = core.wrk.multiply(MB, None, vector.q, vector.q_coeff, pq_num_words, reduce_only=True) # mod_reduce (mod q)
-
-    mp_blind = core.wrk.multiply(PMBZ, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
-    mq_blind = core.wrk.multiply(QMBZ, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
+    core.bnk.crt_x.set_operands_crt_xy(W.A, N.A, vector.x, vector.n_factor)
+    core.bnk.crt_y.set_operands_crt_xy(W.A, N.A, vector.y, vector.n_factor)
+
+    core.bnk.crt_x.set_operands_crt_xy(W.E, N.E, vector.m, vector.m)
+    core.bnk.crt_y.set_operands_crt_xy(W.E, N.E, vector.m, vector.m)
+                                                                                   #                             | A               | B     | C                | D       | E |
+                                                                                   #                             +-----------------+-------+------------------+---------+---+
+                                                                                   #                             | [XY] ; N_FACTOR | ?     | ?                | ?       | M |
+    core.modular_multiply(W.A, N.A, W.B, N.B, n_num_words)                         # [XY]F  =[XY]*N_FACTOR       | [XY] ; N_FACTOR | [XY]F | ?                | ?       | M |
+    core.modular_multiply(W.B, N.B, W.C, N.C, n_num_words, mode=(False, False))    # [XY]MF =[XY]F*[XY]F         | [XY] ; N_FACTOR | [XY]F | [XY]MF           | ?       | M |
+    core.modular_multiply(W.C, N.I, W.D, N.D, n_num_words)                         # [XY]M  =[XY]MF*1            | [XY] ; N_FACTOR | [XY]F | [XY]MF           | [XY]M   | M |
+    core.reduce_narrow(N.D, n_num_words)                                           #                             |                 |       |                  |         |   |
+    core.set_output(O.XM, core.bnk.crt_x, N.D)                                     #                             |                 |       |                  |         |   |
+    core.set_output(O.YM, core.bnk.crt_y, N.D)                                     #                             |                 |       |                  |         |   |
+    core.modular_multiply(W.E, N.B, W.C, N.C, n_num_words)                         # [XY]MB =M*[XY]F             | [XY] ; N_FACTOR | [XY]F | [XY]MB           | [XY]M   | M |
+    core.move_crt_y2x(W.C, N.C)                                                    #                             | [XY] ; N_FACTOR | [XY]F | YMB              | [XY]M   | M |
+    core.bnk.crt_x.set_modulus(vector.p, vector.p_coeff)                           #                             |                 |       |                  |         |   |
+    core.bnk.crt_y.set_modulus(vector.q, vector.q_coeff)                           #                             |                 |       |                  |         |   |
+    core.bnk.crt_x.set_operands_crt_xy(W.A, N.A, vector.p_factor, vector.p_factor) #                             | [PQ]_FACTOR     | [XY]F | YMB              | [XY]M   | M |
+    core.bnk.crt_y.set_operands_crt_xy(W.A, N.A, vector.q_factor, vector.q_factor) #                             | [PQ]_FACTOR     | [XY]F | YMB              | [XY]M   | M |
+    core.reduce_narrow(N.C, n_num_words)                                           #                             |                 |       |                  |         |   |
+    core.modular_reduce(N.C, W.D, N.D, pq_num_words)                               #                             | [PQ]_FACTOR     | [XY]F | YMB              | [PQ]MBZ | M |
+    core.modular_multiply(W.D, N.A, W.C, N.C, pq_num_words)                        # [PQ]MB =[PQ]MBZ*[PQ]_FACTOR | [PQ]_FACTOR     | [XY]F | [PQ]MB           | [PQ]MBZ | M |
+    core.modular_multiply(W.C, N.A, W.D, N.D, pq_num_words)                        # [PQ]MBF=[PQ]MB*[PQ]_FACTOR  | [PQ]_FACTOR     | [XY]F | [PQ]MB           | [PQ]MBF | M |
+    core.modular_multiply(W.A, N.I, W.C, N.C, pq_num_words)                        # [PQ]IF =[PQ]_FACTOR*1       | [PQ]_FACTOR     | [XY]F | [PQ]IF           | [PQ]MBF | M |
+    core.move_ladders_x2y(W.D, N.D, W.C, N.C)                                      #                             | [PQ]_FACTOR     | [XY]F | [PQ]IF / [PQ]MBF | [PQ]MBF | M |
+    
+    #PIF = core.bnk.crt_x.ladder_x.narrow._get_value(N.C)#
+    #QIF = core.bnk.crt_y.ladder_x.narrow._get_value(N.C)#
+    
 
-    mp_blind_factor = core.wrk.multiply(mp_blind, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
-    mq_blind_factor = core.wrk.multiply(mq_blind, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
-    ip_factor = core.wrk.multiply(i, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
-    iq_factor = core.wrk.multiply(i, vector.q_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
 
+########################
 
+    for bit in range(_WORD_WIDTH * pq_num_words - 1, -1, -1):
+    
+        bit_value_p = (vector.dp.number() & (1 << bit)) >> bit
+        bit_value_q = (vector.dq.number() & (1 << bit)) >> bit
 
-    sp_blind_factor = core.wrk.exponentiate(ip_factor, mp_blind_factor, vector.dp, vector.p, vector.p_factor, vector.p_coeff, pq_num_words, dump_index=99, dump_mode="P") # mod_multiply
-    sq_blind_factor = core.wrk.exponentiate(iq_factor, mq_blind_factor, vector.dq, vector.q, vector.q_factor, vector.q_coeff, pq_num_words, dump_index=99, dump_mode="Q") # mod_multiply
+        bit_value_p = bit_value_p > 0
+        bit_value_q = bit_value_q > 0
+            
+        # mode = ... (shortened the next line for better readability)
+            
+        core.modular_multiply(W.C, N.C, W.C, N.C, pq_num_words, mode=(bit_value_p, bit_value_q))    # <LADDER>   | [PQ]_FACTOR     | [XY]F | [PQ]SBF          | [PQ]MBF | M |
+        
+        if (bit % 4) == 0:
+            pct = float((_WORD_WIDTH * pq_num_words - bit) / (_WORD_WIDTH * pq_num_words)) * 100.0
+            print("\rdone: %5.1f%%" % pct, end='')
+        
+    print("")
 
-    SPB = core.wrk.multiply(i, sp_blind_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
-    SQB = core.wrk.multiply(i, sq_blind_factor, vector.q, vector.q_coeff, pq_num_words) # mod_multiply
+    core.modular_multiply(W.C, N.I, W.D, N.D, pq_num_words)                        # [PQ]SB=[PQ]SBF*1            | [PQ]_FACTOR     | [XY]F | [PQ]SBF          | [PQ]SB  | M |
 
-    core.wrk.reduce(SPB, len(SPB.words)) # just_reduce
-    core.wrk.reduce(SQB, len(SQB.words)) # just_reduce
+############################
+    
+    core.reduce_narrow(N.D, pq_num_words)
+    
+    #SQB = core.bnk.crt_y.ladder_x.narrow._get_value(N.D)
+    
+    core.flip_ladder_y2x(W.D, N.D, W.D, N.D)
+    core.modular_subtract(N.D, N.C, W.C, pq_num_words)                     #                             | [PQ]_FACTOR     | [XY]F | RSB              | [PQ]SB  | M    |
+    core.bnk.crt_x.set_operands_crt_xy(W.E, N.E, vector.qinv, vector.qinv) #                             | [PQ]_FACTOR     | [XY]F | RSB              | [PQ]SB  | QINV |
+    core.bnk.crt_y.set_operands_crt_xy(W.E, N.E, vector.qinv, vector.qinv) #                             | [PQ]_FACTOR     | [XY]F | RSB              | [PQ]SB  | QINV |
 
-    sr_blind = core.wrk.subtract(SPB, SQB, vector.p, pq_num_words) # mod_subtract
+    core.modular_multiply(W.C, N.E, W.C, N.C, pq_num_words)                #                             | [PQ]_FACTOR     | [XY]F | RSBIZ            | [PQ]SB  | QINV |
+    core.modular_multiply(W.C, N.A, W.C, N.C, pq_num_words)                #                             | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | QINV |
+    
+    core.bnk.crt_x.set_operands_crt_xy(W.E, N.E, vector.q, vector.q) #                             | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | Q |
+    core.bnk.crt_y.set_operands_crt_xy(W.E, N.E, vector.q, vector.q) #                             | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | Q |
+    
+    core.just_multiply(W.E, N.C, pq_num_words)                            # | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | Q |
 
-    sr_qinv_blind_inverse_factor = core.wrk.multiply(sr_blind, vector.qinv, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
-    sr_qinv_blind = core.wrk.multiply(sr_qinv_blind_inverse_factor, vector.p_factor, vector.p, vector.p_coeff, pq_num_words) # mod_multiply
+    core.merge_lha(N.A, pq_num_words)
     
-    q_sr_qinv_blind = core.wrk.multiply(vector.q, sr_qinv_blind, None, None, pq_num_words, multiply_only=True) # just_multiply
+    core.reduce_narrow(N.A, n_num_words)
 
-    core.wrk.reduce(q_sr_qinv_blind, n_num_words) # just_reduce
+    core.move_crt_y2x(W.D, N.D)
     
-    SB = core.wrk.add(SQB, q_sr_qinv_blind, pq_num_words) # just_add
+    #RQSBI = core.bnk.crt_x.ladder_x.narrow._get_value(N.A)
+    
+    core.just_add(N.D, N.A, N.C, pq_num_words)   # 
+    SB = core.bnk.crt_x.ladder_x._get_narrow(N.C)
+    #print(hex(SB.number()))
 
-    S = core.wrk.multiply(SB, XF, vector.n, vector.n_coeff, n_num_words) # mod_multiply
+    #SB = core.wrk.add(SQB, RQSBI, pq_num_words) # just_add
+    #print(hex(SB.number()))
+    
+    
+    # check why multiplication is not commutative!?
+    
+    
+    XF = core.bnk.crt_x.ladder_x.wide._get_value(W.B)
 
-    core.wrk.reduce(S, len(S.words)) # just_reduce
+    core.bnk.crt_x.set_modulus(vector.n, vector.n_coeff)
+    core.bnk.crt_y.set_modulus(vector.n, vector.n_coeff)
 
-    # check
+    core.modular_multiply(W.B, N.C, W.A, N.A, n_num_words, mode=(False, False))
+    core.reduce_narrow(N.A, n_num_words)    
+    core.set_output(O.S, core.bnk.crt_x, N.A)
+    
+    S  = core.out.get_value(O.S)
     XM = core.out.get_value(O.XM)
     YM = core.out.get_value(O.YM)
     


From git at cryptech.is  Mon Aug 19 11:07:13 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:13 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 11/12: * MASSIVE CLEANUP
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110713.53DEA991D75@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit b0fb2639a4d00033e91256486dbb9673761993d7
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Mon Aug 19 14:01:28 2019 +0300

    * MASSIVE CLEANUP
    
    * All the data buses are now either 16 or 18 bits wide for consistency
    
    * More consistent naming of micro-operations
    
    * More debugging options (can specify which ladder iteration to dump)
---
 modexpng_fpga_model.py | 1144 ++++++++++++++++++++++++++++--------------------
 1 file changed, 674 insertions(+), 470 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index f57c7b9..71a4b91 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -63,9 +63,13 @@ NUM_MULTS  = 8
 _KEY_LENGTH_HALF = KEY_LENGTH // 2
 
 # width of internal math pipeline
-_WORD_WIDTH = 16
+_WORD_WIDTH     = 16
 _WORD_WIDTH_EXT = 18
 
+_WORD_MASK     = 2 ** _WORD_WIDTH     - 1
+_WORD_MASK_EXT = 2 ** _WORD_WIDTH_EXT - 1
+_CARRY_MASK    = _WORD_MASK ^ _WORD_MASK_EXT
+
 # folder with test vector scripts
 _VECTOR_PATH = "/vector"
 
@@ -76,16 +80,17 @@ _VECTOR_CLASS = "Vector"
 # ------------------
 # Debugging Settings
 # ------------------
-FORCE_OVERFLOW = False
-DUMP_VECTORS = False
-DUMP_INDICES = False
-DUMP_MACS_INPUTS = False
-DUMP_MACS_CLEARING = False
-DUMP_MACS_ACCUMULATION = False
-DUMP_MULT_PARTS = False
-DUMP_RCMB = False
-DUMP_REDUCTION = False
-
+DUMP_LADDER_INDEX      = -1     # at which ladder step to print debug vector
+DUMP_VECTORS           = False  # print entire debug vector components
+DUMP_INDICES           = False  # print indices of words at MAC inputs
+DUMP_MACS_INPUTS       = False  # print MAC input words
+DUMP_MACS_CLEARING     = False  # print MAC clearing bitmaps
+DUMP_MACS_ACCUMULATION = False  # print MAC accumulators contents
+DUMP_MULT_PARTS        = False  # print multiplication output parts
+DUMP_RECOMBINATION     = False  # print recombination internals
+DUMP_REDUCTION         = False  # print reduction internals
+FORCE_OVERFLOW         = False  # force rarely seen internal overflow situation to verify how its handler works
+DUMP_PROGRESS_FACTOR   = 16     # once per how many ladder steps to update progress indicator
 
 #
 # Multi-Precision Integer
@@ -116,7 +121,7 @@ class ModExpNG_Operand():
             if i > 0:
                 if (i % 4) == 0: print("")
                 else:            print(" ", end='')
-            print("%s[%2d] = 18'h%05x;" % (name, i, self.words[i]), end='')
+            print("%s[%3d] = 18'h%05x;" % (name, i, self.words[i]), end='')
         print("")
 
     def _init_from_words(self, words, count):
@@ -127,7 +132,7 @@ class ModExpNG_Operand():
             if words[i] >= (2 ** (_WORD_WIDTH_EXT)):
                 raise Exception("Word is too large!")
 
-        self.words = words
+        self.words = list(words)
 
     def _init_from_number(self, number, length):
 
@@ -160,16 +165,16 @@ class ModExpNG_Operand():
             ret += word << shift
             shift += _WORD_WIDTH
         return ret
-        
+
     def _get_half(self, part):
         num_words = len(self.words)
         num_words_half = num_words // 2
         if not part: return ModExpNG_Operand(None, num_words_half, self.words[:num_words_half])
         else:        return ModExpNG_Operand(None, num_words_half, self.words[num_words_half:])
-        
+
     def lower_half(self):
         return self._get_half(False)
-        
+
     def upper_half(self):
         return self._get_half(True)
 
@@ -222,7 +227,7 @@ class ModExpNG_WideBankEnum(Enum):
     N   = auto()
     L   = auto()
     H   = auto()
-    
+
 class ModExpNG_NarrowBankEnum(Enum):
     A       = auto()
     B       = auto()
@@ -231,7 +236,32 @@ class ModExpNG_NarrowBankEnum(Enum):
     E       = auto()
     N_COEFF = auto()
     I       = auto()
-        
+
+class ModExpNG_CoreInputEnum(Enum):
+    M        = auto()
+
+    N        = auto()
+    P        = auto()
+    Q        = auto()
+
+    N_COEFF  = auto()
+    P_COEFF  = auto()
+    Q_COEFF  = auto()
+
+    N_FACTOR = auto()
+    P_FACTOR = auto()
+    Q_FACTOR = auto()
+
+    X        = auto()
+    Y        = auto()
+
+    QINV     = auto()
+
+class ModExpNG_CoreOutputEnum(Enum):
+    XM = auto()
+    YM = auto()
+    S  = auto()
+
 class ModExpNG_WideBank():
 
     def __init__(self):
@@ -243,7 +273,7 @@ class ModExpNG_WideBank():
         self.n = None
         self.l = None
         self.h = None
-    
+
     def _get_value(self, sel):
         if   sel == ModExpNG_WideBankEnum.A:   return self.a
         elif sel == ModExpNG_WideBankEnum.B:   return self.b
@@ -276,7 +306,7 @@ class ModExpNG_NarrowBank():
         self.e       = None
         self.n_coeff = None
         self.i       = i
-        
+
     def _get_value(self, sel):
         if   sel == ModExpNG_NarrowBankEnum.A:       return self.a
         elif sel == ModExpNG_NarrowBankEnum.B:       return self.b
@@ -296,21 +326,106 @@ class ModExpNG_NarrowBank():
         elif sel == ModExpNG_NarrowBankEnum.N_COEFF: self.n_coeff = value
         else: raise Exception("ModExpNG_NarrowBank._set_value(): Invalid selector!")
 
+class ModExpNG_CoreInput():
+
+    def __init__(self):
+        self._m        = None
+
+        self._n        = None
+        self._p        = None
+        self._q        = None
+
+        self._n_coeff  = None
+        self._p_coeff  = None
+        self._q_coeff  = None
+
+        self._n_factor = None
+        self._p_factor = None
+        self._q_factor = None
+
+        self._x        = None
+        self._y        = None
+
+        self._qinv     = None
+
+    def set_value(self, sel, value):
+        if   sel == ModExpNG_CoreInputEnum.M:        self._m        = value
+
+        elif sel == ModExpNG_CoreInputEnum.N:        self._n        = value
+        elif sel == ModExpNG_CoreInputEnum.P:        self._p        = value
+        elif sel == ModExpNG_CoreInputEnum.Q:        self._q        = value
+
+        elif sel == ModExpNG_CoreInputEnum.N_COEFF:  self._n_coeff  = value
+        elif sel == ModExpNG_CoreInputEnum.P_COEFF:  self._p_coeff  = value
+        elif sel == ModExpNG_CoreInputEnum.Q_COEFF:  self._q_coeff  = value
+
+        elif sel == ModExpNG_CoreInputEnum.N_FACTOR: self._n_factor = value
+        elif sel == ModExpNG_CoreInputEnum.P_FACTOR: self._p_factor = value
+        elif sel == ModExpNG_CoreInputEnum.Q_FACTOR: self._q_factor = value
+
+        elif sel == ModExpNG_CoreInputEnum.X:        self._x        = value
+        elif sel == ModExpNG_CoreInputEnum.Y:        self._y        = value
+
+        elif sel == ModExpNG_CoreInputEnum.QINV:     self._qinv     = value
+
+        else: raise Exception("ModExpNG_CoreInput.set_value(): invalid selector!")
+
+    def _get_value(self, sel):
+        if   sel == ModExpNG_CoreInputEnum.M:        return self._m
+
+        elif sel == ModExpNG_CoreInputEnum.N:        return self._n
+        elif sel == ModExpNG_CoreInputEnum.P:        return self._p
+        elif sel == ModExpNG_CoreInputEnum.Q:        return self._q
+
+        elif sel == ModExpNG_CoreInputEnum.N_COEFF:  return self._n_coeff
+        elif sel == ModExpNG_CoreInputEnum.P_COEFF:  return self._p_coeff
+        elif sel == ModExpNG_CoreInputEnum.Q_COEFF:  return self._q_coeff
+
+        elif sel == ModExpNG_CoreInputEnum.N_FACTOR: return self._n_factor
+        elif sel == ModExpNG_CoreInputEnum.P_FACTOR: return self._p_factor
+        elif sel == ModExpNG_CoreInputEnum.Q_FACTOR: return self._q_factor
+
+        elif sel == ModExpNG_CoreInputEnum.X:        return self._x
+        elif sel == ModExpNG_CoreInputEnum.Y:        return self._y
+
+        elif sel == ModExpNG_CoreInputEnum.QINV:     return self._qinv
+
+        else: raise Exception("ModExpNG_CoreInput._get_value(): invalid selector!")
+
+class ModExpNG_CoreOutput():
+
+    def __init__(self):
+        self._xm = None
+        self._ym = None
+        self._s  = None
+
+    def _set_value(self, sel, value):
+        if   sel == ModExpNG_CoreOutputEnum.XM: self._xm = value
+        elif sel == ModExpNG_CoreOutputEnum.YM: self._ym = value
+        elif sel == ModExpNG_CoreOutputEnum.S:  self._s  = value
+        else: raise Exception("ModExpNG_CoreOutput._set_value(): invalid selector!")
+
+    def get_value(self, sel):
+        if   sel == ModExpNG_CoreOutputEnum.XM: return self._xm
+        elif sel == ModExpNG_CoreOutputEnum.YM: return self._ym
+        elif sel == ModExpNG_CoreOutputEnum.S:  return self._s
+        else: raise Exception("ModExpNG_CoreOutput.get_value(): invalid selector!")
+
 class ModExpNG_BanksPair():
-    
+
     def __init__(self, i):
         self.wide = ModExpNG_WideBank()
         self.narrow = ModExpNG_NarrowBank(i)
-        
+
     def _get_wide(self, sel):
         return self.wide._get_value(sel)
 
     def _get_narrow(self, sel):
         return self.narrow._get_value(sel)
-        
+
     def _set_wide(self, sel, value):
         self.wide._set_value(sel, value)
-        
+
     def _set_narrow(self, sel, value):
         self.narrow._set_value(sel, value)
 
@@ -319,20 +434,6 @@ class ModExpNG_BanksLadder():
     def __init__(self, i):
         self.ladder_x = ModExpNG_BanksPair(i)
         self.ladder_y = ModExpNG_BanksPair(i)
-        
-    def set_modulus(self, n, n_coeff):
-        self.ladder_x.wide._set_value(ModExpNG_WideBankEnum.N, n)
-        self.ladder_y.wide._set_value(ModExpNG_WideBankEnum.N, n)
-        self.ladder_x.narrow._set_value(ModExpNG_NarrowBankEnum.N_COEFF, n_coeff)
-        self.ladder_y.narrow._set_value(ModExpNG_NarrowBankEnum.N_COEFF, n_coeff)
-        
-    def set_operands_crt_xy(self, sel_wide, sel_narrow, x, y):
-        if sel_wide is not None:
-            self.ladder_x.wide._set_value(sel_wide, x)
-            self.ladder_y.wide._set_value(sel_wide, x)
-        if sel_narrow is not None:
-            self.ladder_x.narrow._set_value(sel_narrow, y)
-            self.ladder_y.narrow._set_value(sel_narrow, y)
 
 class ModExpNG_BanksCRT():
 
@@ -350,7 +451,7 @@ class ModExpNG_PartRecombinator():
 
     def _flush_pipeline(self, dump):
         self.z0, self.y0, self.x0 = 0, 0, 0
-        if dump and DUMP_RCMB:
+        if dump and DUMP_RECOMBINATION:
             print("RCMB -> flush()")
 
     def _push_pipeline(self, part, dump):
@@ -366,13 +467,13 @@ class ModExpNG_PartRecombinator():
         x1 = x + self.y0 + (self.x0 >> _WORD_WIDTH) # IMPORTANT: This carry can be up to two bits wide!!
 
         # save lower 16 bits of the rightmost cell
-        t = self.x0 & 0xffff
+        t = self.x0 & _WORD_MASK
 
         # update internal latches
         self.z0, self.y0, self.x0 = z1, y1, x1
 
         # dump
-        if dump and DUMP_RCMB:
+        if dump and DUMP_RECOMBINATION:
             print("RCMB -> push(): part = 0x%012x, word = 0x%04x" % (part, t))
 
         # done
@@ -416,7 +517,7 @@ class ModExpNG_PartRecombinator():
         # merge upper half adding the two overlapping words
         for x in range(ab_num_words):
             next_word = words_msb[x]
-            if x < 2:                
+            if x < 2:
                 next_word += words_lsb[x + ab_num_words]
             words.append(next_word)
 
@@ -469,7 +570,7 @@ class ModExpNG_PartRecombinator():
 
             if i > 0:
                 words_msb.append(next_word)
-                
+
         # merge words
         words = list()
 
@@ -522,10 +623,10 @@ class ModExpNG_WordMultiplier():
 
     def _update_one_mac(self, x, t, col, a, b, dump, need_aux=False):
 
-        if a > 0x3FFFF:
+        if a >= (2 ** _WORD_WIDTH_EXT):
             raise Exception("a > 0x3FFFF!")
 
-        if b > 0xFFFF:
+        if b >= (2 ** _WORD_WIDTH):
             raise Exception("b > 0xFFFF!")
 
         p = a * b
@@ -534,20 +635,21 @@ class ModExpNG_WordMultiplier():
             if x > 0: print("; ", end='')
             print("MAC[%d]: a=%05x" % (x, a), end='')
             if x == (NUM_MULTS-1) and not need_aux: print("")
-            
+
         self._macs[x] += p
 
     def _update_mac_aux(self, y, col, a, b, dump):
-        
-        if a > 0x3FFFF:
+
+        if a >= (2 ** _WORD_WIDTH_EXT):
             raise Exception("a > 0x3FFFF!")
 
-        if b > 0xFFFF:
+        if b >= (2 ** _WORD_WIDTH):
             raise Exception("b > 0xFFFF!")
 
         p = a * b
         if dump and DUMP_MACS_INPUTS:
             print("; AUX: a=%05x" % a)
+            
         self._mac_aux[0] += p
 
     def _preset_indices(self, col):
@@ -617,9 +719,9 @@ class ModExpNG_WordMultiplier():
             parts.append(0)
 
         for col in range(num_cols):
-        
+
             b_carry = 0
-        
+
             for t in range(ab_num_words):
 
                 # take care of indices
@@ -638,10 +740,13 @@ class ModExpNG_WordMultiplier():
                 if dump and DUMP_INDICES: self._dump_indices(t, col)
 
                 # current b-word
-                # TODO: Explain how the 18th bit carry works!!
+                # multiplier's b-input is limited to 16-bit words, so we need to propagate
+                # carries on the fly here, carry can be up to two bits
                 bt = b_narrow.words[t] + b_carry
-                b_carry = (bt & 0x30000) >> 16
-                bt &= 0xFFFF
+                b_carry = (bt & _CARRY_MASK) >> _WORD_WIDTH
+                if dump and b_carry > 1:
+                    print("Rare overflow case was detected and then successfully corrected.")
+                bt &= _WORD_MASK
 
                 # multiply by a-words
                 for x in range(NUM_MULTS):
@@ -781,13 +886,8 @@ class ModExpNG_WordMultiplier():
 
 class ModExpNG_LowlevelOperator():
 
-    def __init__(self):
-        self._word_mask = 0
-        for x in range(_WORD_WIDTH):
-            self._word_mask |= (1 << x)
-
     def _check_word(self, a):
-        if a < 0 or a >= (2 ** _WORD_WIDTH):
+        if a < 0 or a > _WORD_MASK:
             raise Exception("Word out of range!")
 
     def _check_carry_borrow(self, cb):
@@ -802,13 +902,13 @@ class ModExpNG_LowlevelOperator():
 
         sum = a + b + c_in
 
-        sum_s = sum & self._word_mask
-        sum_c = (sum >> _WORD_WIDTH) & 1
+        sum_s = sum & _WORD_MASK
+        sum_c = sum >> _WORD_WIDTH
 
         return (sum_c, sum_s)
 
     def sub_words(self, a, b, b_in):
-    
+
         self._check_word(a)
         self._check_word(b)
         self._check_carry_borrow(b_in)
@@ -827,127 +927,44 @@ class ModExpNG_LowlevelOperator():
 class ModExpNG_Worker():
 
     def __init__(self):
-        self.recombinator = ModExpNG_PartRecombinator()
-        self.multiplier   = ModExpNG_WordMultiplier()
         self.lowlevel     = ModExpNG_LowlevelOperator()
+        self.multiplier   = ModExpNG_WordMultiplier()
+        self.recombinator = ModExpNG_PartRecombinator()
 
-    def exponentiate(self, iz, bz, e, n, n_factor, n_coeff, num_words, dump_index=-1, dump_mode=""):
-
-        # working variables
-        t1, t2 = iz, bz
-
-        # length-1, length-2, length-3, ..., 1, 0 (left-to-right)
-        for bit in range(_WORD_WIDTH * num_words - 1, -1, -1):
-
-            bit_value = (e.number() & (1 << bit)) >> bit
-
-            if bit > 500:
-                print("%s: bit=#%d (%d)" % (dump_mode, bit, bit_value))
-                print("")
-                print("%s_T1_BEFORE: %s" % (dump_mode, hex(t1.number())))
-                print("%s_T2_BEFORE: %s" % (dump_mode, hex(t2.number())))
-                print("")
-            else:
-                return None
-
-
-            debug_dump = bit == dump_index
-
-            
-            if debug_dump:
-                print("\rladder_mode = %d" % bit_value)
-                
-                if FORCE_OVERFLOW:
-                    T1X = list(t1.words)
-                    for i in range(num_words):
-                        if i > 0:
-                            bits = T1X[i-1] & (3 << 16)
-                            if bits == 0:
-                                bits = T1X[i] & 3
-                                T1X[i] = T1X[i] ^ bits
-                                T1X[i-1] |= (bits << 16)
-                                    
-                    for i in range(num_words):
-                        t1.words[i] = T1X[i]
-                
-                if DUMP_VECTORS:
-                    print("num_words = %d" % num_words)
-                    t1.format_verilog_concat("%s_T1" % dump_mode)
-                    t2.format_verilog_concat("%s_T2" % dump_mode)
-                    n.format_verilog_concat("%s_N" % dump_mode)
-                    n_coeff.format_verilog_concat("%s_N_COEFF"  % dump_mode)
-                            # force the rarely seen overflow
-
-            if bit_value:
-                p1 = self.multiply(t1, t2, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="X")
-                p2 = self.multiply(t2, t2, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="Y")
-            else:
-                p1 = self.multiply(t1, t1, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="X")
-                p2 = self.multiply(t2, t1, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="Y")
-
-            t1, t2 = p1, p2
-
-            if debug_dump and DUMP_VECTORS:
-                t1.format_verilog_concat("%s_X" % dump_mode)
-                t2.format_verilog_concat("%s_Y" % dump_mode)
-
-            if (bit % 16) == 0:
-                pct = float((_WORD_WIDTH * num_words - bit) / (_WORD_WIDTH * num_words)) * 100.0
-                print("\rpct: %5.1f%%" % pct, end='')
-        
-        print("")
-
-        return t1
-
-    def subtract(self, a, b, n, ab_num_words):
-
+    def serial_subtract_modular(self, a, b, n, ab_num_words):
         c_in = 0
         b_in = 0
-
         ab = list()
         ab_n = list()
-
         for x in range(ab_num_words):
-
             a_word = a.words[x]
             b_word = b.words[x]
-
             (b_out, d_out) = self.lowlevel.sub_words(a_word, b_word, b_in)
             (c_out, s_out) = self.lowlevel.add_words(d_out, n.words[x], c_in)
-
             ab.append(d_out)
             ab_n.append(s_out)
-
             (c_in, b_in) = (c_out, b_out)
-
         d = ab if not b_out else ab_n
-
         return ModExpNG_Operand(None, ab_num_words, d)
 
-    def add(self, a, b, ab_num_words):
-
+    def serial_add_uneven(self, a, b, ab_num_words):
         c_in = 0
-
         ab = list()
-
         for x in range(2 * ab_num_words):
-
             a_word = a.words[x] if x < ab_num_words else 0
             b_word = b.words[x]
-
             (c_out, s_out) = self.lowlevel.add_words(a_word, b_word, c_in)
-
             ab.append(s_out)
-
             c_in = c_out
-
         return ModExpNG_Operand(None, 2*ab_num_words, ab)
 
-    def multiply(self, a, b, n, n_coeff, ab_num_words, reduce_only=False, multiply_only=False, dump=False, dump_mode="", dump_phase=""):
+    def multipurpose_multiply(self, a, b, n, n_coeff, ab_num_words, reduce_only=False, multiply_only=False, dump=False, dump_crt="", dump_ladder=""):
 
+        #
         # 1. AB = A * B
-        if dump: print("multiply_square(%s_%s)" % (dump_mode, dump_phase))
-        
+        #
+        if dump: print("multiply_square(%s_%s)" % (dump_crt, dump_ladder))
+
         if reduce_only:
             ab = b
         else:
@@ -956,169 +973,220 @@ class ModExpNG_Worker():
             ab = ModExpNG_Operand(None, 2 * ab_num_words, ab_words)
 
         if dump and DUMP_VECTORS:
-            ab.format_verilog_concat("%s_%s_AB" % (dump_mode, dump_phase))
+            ab.format_verilog_concat("%s_%s_AB" % (dump_crt, dump_ladder))
 
         if multiply_only:
             return ModExpNG_Operand(None, 2*ab_num_words, ab_words)
 
-            
+        #
         # 2. Q = LSB(AB) * N_COEFF
-        if dump: print("multiply_triangle(%s_%s)" % (dump_mode, dump_phase))
-        
+        #
+        if dump: print("multiply_triangle(%s_%s)" % (dump_crt, dump_ladder))
+
         q_parts = self.multiplier.multiply_triangle(ab, n_coeff, ab_num_words, dump)
         q_words = self.recombinator.recombine_triangle(q_parts, ab_num_words, dump)
         q = ModExpNG_Operand(None, ab_num_words + 1, q_words)
 
         if dump and DUMP_VECTORS:
-            q.format_verilog_concat("%s_%s_Q" % (dump_mode, dump_phase))
+            q.format_verilog_concat("%s_%s_Q" % (dump_crt, dump_ladder))
 
+        #
         # 3. M = Q * N
-        if dump: print("multiply_rectangle(%s_%s)" % (dump_mode, dump_phase))
-        
+        #
+        if dump: print("multiply_rectangle(%s_%s)" % (dump_crt, dump_ladder))
+
         m_parts = self.multiplier.multiply_rectangle(n, q, ab_num_words, dump)
         m_words = self.recombinator.recombine_rectangle(m_parts, ab_num_words, dump)
         m = ModExpNG_Operand(None, 2 * ab_num_words + 1, m_words)
-        
-        if dump and DUMP_VECTORS:
-            m.format_verilog_concat("%s_%s_M" % (dump_mode, dump_phase))
 
-        if (m.number() != (q.number() * n.number())):
-            print("MISMATCH")
-            sys.exit()
+        if dump and DUMP_VECTORS:
+            m.format_verilog_concat("%s_%s_M" % (dump_crt, dump_ladder))
 
-            
+        #
         # 4. R = AB + M
-        
+        #
+
+        #
         # 4a. compute carry (actual sum is all zeroes and need not be stored)
+        #
+        
         r_cy = 0 # this can be up to two bits, since we're adding extended words!!
         for i in range(ab_num_words + 1):
             s = ab.words[i] + m.words[i] + r_cy
-            r_cy_new = s >> 16
-            
+            r_cy_new = s >> _WORD_WIDTH
+
             if dump and DUMP_REDUCTION:
                 print("[%2d] 0x%05x + 0x%05x + 0x%x => {0x%x, [0x%05x]}" %
-                    (i, ab.words[i], m.words[i], r_cy, r_cy_new, s & 0xffff))
-                
+                    (i, ab.words[i], m.words[i], r_cy, r_cy_new, s & 0xffff))   # ???
+
             r_cy = r_cy_new
-        
-        
+
+
+        #
         # 4b. Initialize empty result
+        #
+        
         R = list()
         for i in range(ab_num_words):
             R.append(0)
 
+        #
         # 4c. compute the actual upper part of sum (take carry into account)
-        for i in range(ab_num_words):
+        #
         
+        for i in range(ab_num_words):
+
             if dump and DUMP_REDUCTION:
                 print("[%2d]" % i, end='')
-                
+
             ab_word = ab.words[ab_num_words + i + 1] if i < (ab_num_words - 1) else 0
             if dump and DUMP_REDUCTION:
                 print(" 0x%05x" % ab_word, end='')
-                
+
             m_word = m.words[ab_num_words + i + 1]
             if dump and DUMP_REDUCTION:
                 print(" + 0x%05x" % m_word, end='')
-                
+
             if i == 0: R[i] = r_cy
             else:      R[i] = 0
-            
-            if (r_cy > 3): print("\rR_CY = %d!" % r_cy)
-            
+
             if dump and DUMP_REDUCTION:
                 print(" + 0x%x" % R[i], end='')
-                
+
             R[i] += ab_word
             R[i] += m_word
             if dump and DUMP_REDUCTION:
                 print(" = 0x%05x" % R[i])
-                        
+
         return ModExpNG_Operand(None, ab_num_words, R)
-    
-    def reduce(self, a, num_words, carry_in=0):
-        carry = carry_in
+
+    def convert_nonredundant(self, a, num_words):
+        carry = 0
         for x in range(num_words):
             a.words[x] += carry
-            carry = (a.words[x] >> _WORD_WIDTH) & 3
-            a.words[x] &= self.lowlevel._word_mask
+            carry = a.words[x] >> _WORD_WIDTH
+            a.words[x] &= _WORD_MASK
         return carry
 
-class ModExpNG_CoreOutputEnum(Enum):
-    XM = auto()
-    YM = auto()
-    S  = auto()
-            
-class ModExpNG_CoreOutput():
-    
-    def __init__(self):
-        self._xm = None
-        self._ym = None
-        self._s  = None
-        
-    def _set_value(self, sel, value):
-        if   sel == ModExpNG_CoreOutputEnum.XM: self._xm = value
-        elif sel == ModExpNG_CoreOutputEnum.YM: self._ym = value
-        elif sel == ModExpNG_CoreOutputEnum.S:  self._s  = value
-        else: raise Exception("ModExpNG_CoreOutput._set_value(): invalid selector!")
-        
-    def get_value(self, sel):
-        if   sel == ModExpNG_CoreOutputEnum.XM: return self._xm
-        elif sel == ModExpNG_CoreOutputEnum.YM: return self._ym
-        elif sel == ModExpNG_CoreOutputEnum.S:  return self._s
-        else: raise Exception("ModExpNG_CoreOutput.get_value(): invalid selector!")
-            
 class ModExpNG_Core():
-    
+
     def __init__(self, i):
         self.wrk = ModExpNG_Worker()
         self.bnk = ModExpNG_BanksCRT(i)
+        self.inp = ModExpNG_CoreInput()
         self.out = ModExpNG_CoreOutput()
+
+    #
+    # CRT_(X|Y) means either CRT_X or CRT_Y
+    # LADDER_{X,Y} means both LADDER_X and LADDER_Y
+    #
+
+    #
+    # copy from CRT_(X|Y).LADDER_X.NARROW to OUTPUT
+    #
+    def set_output_from_narrow(self, sel_output, bank_crt, sel_narrow):
+        self.out._set_value(sel_output, bank_crt.ladder_x.narrow._get_value(sel_narrow))
+
+    #
+    # copy from INPUT to CRT_(X|Y).LADDER_{X,Y}.NARROW
+    #
+    def set_narrow_from_input(self, bank_crt, sel_narrow, sel_input):
+        bank_crt.ladder_x._set_narrow(sel_narrow, self.inp._get_value(sel_input))
+        bank_crt.ladder_y._set_narrow(sel_narrow, self.inp._get_value(sel_input))
+
+    #
+    # copy from INPUT to CRT_(X|Y).LADDER_{X,Y}.WIDE
+    #
+    def set_wide_from_input(self, bank_crt, sel_wide, sel_input):
+        bank_crt.ladder_x._set_wide(sel_wide, self.inp._get_value(sel_input))
+        bank_crt.ladder_y._set_wide(sel_wide, self.inp._get_value(sel_input))
+
+    #
+    # copy from CRT_Y.LADDER_{X,Y}.{WIDE,NARROW} to CRT_X.LADDER_{X,Y}.{WIDE,NARROW}
+    #
+    def copy_crt_y2x(self, sel_wide, sel_narrow):
+
+        self.bnk.crt_x.ladder_x._set_wide(sel_wide, self.bnk.crt_y.ladder_x._get_wide(sel_wide))
+        self.bnk.crt_x.ladder_y._set_wide(sel_wide, self.bnk.crt_y.ladder_y._get_wide(sel_wide))
+
+        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow))
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow, self.bnk.crt_y.ladder_y._get_narrow(sel_narrow))
+
+    #
+    # copy from CRT_{X,Y}.LADDER_X.{WIDE,NARROW} to CRT_{X,Y}.LADDER_Y.{WIDE,NARROW}
+    #
+    def copy_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
+
+        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, self.bnk.crt_x.ladder_x._get_wide(sel_wide_in))
+        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, self.bnk.crt_y.ladder_x._get_wide(sel_wide_in))
+
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in))
+        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in))
+
+    #
+    # copy from CRT_{X,Y}.LADDER_X.{WIDE,NARROW} to CRT_{Y,X}.LADDER_Y.{WIDE,NARROW}
+    #
+    def cross_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
+
+        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, self.bnk.crt_y.ladder_x._get_wide(sel_wide_in))
+        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, self.bnk.crt_x.ladder_x._get_wide(sel_wide_in))
         
-    def modular_multiply(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words, mode=(True, True)):
-        
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in))
+        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in))
+
+    #
+    # modular multiply sel_wide_in by sel_narrow_in
+    # stores intermediate result in WIDE.L and WIDE.H
+    # needs modulus WIDE.N and speed-up coefficients NARROW.N_COEFF to be filled
+    # places two copies of resulting quantity in sel_wide_out and sel_narrow_out
+    # sel_*_in and sel_*_out can overlap (overwriting of input operands is ok)
+    #
+    def modular_multiply(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words, mode=(True, True), d=False):
+
         xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
         yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        
+
         xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
         yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        
+
         xxa       = self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in)
         xya       = self.bnk.crt_x.ladder_y.wide._get_value(sel_wide_in)
 
         yxa       = self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in)
         yya       = self.bnk.crt_y.ladder_y.wide._get_value(sel_wide_in)
-        
+
         xxb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
         xyb       = self.bnk.crt_x.ladder_y.narrow._get_value(sel_narrow_in)
 
         yxb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
         yyb       = self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow_in)
-        
+
         if not mode[0]: xb = xxb
         else:           xb = xyb
 
         if not mode[1]: yb = yxb
         else:           yb = yyb
 
-        xxp = self.wrk.multiply(xxa, xb, xn, xn_coeff, num_words)
-        xyp = self.wrk.multiply(xya, xb, xn, xn_coeff, num_words)
+        xxp = self.wrk.multipurpose_multiply(xxa, xb, xn, xn_coeff, num_words, dump=d, dump_crt="X", dump_ladder="X")
+        xyp = self.wrk.multipurpose_multiply(xya, xb, xn, xn_coeff, num_words, dump=d, dump_crt="X", dump_ladder="Y")
 
-        yxp = self.wrk.multiply(yxa, yb, yn, yn_coeff, num_words)
-        yyp = self.wrk.multiply(yya, yb, yn, yn_coeff, num_words)
-        
-        if sel_wide_out is not None:
-            self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xxp)
-            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, xyp)
-            self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yxp)
-            self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, yyp)
-        
-        if sel_narrow_out is not None:
-            self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xxp)
-            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, xyp)
-            self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yxp)
-            self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, yyp)
+        yxp = self.wrk.multipurpose_multiply(yxa, yb, yn, yn_coeff, num_words, dump=d, dump_crt="Y", dump_ladder="X")
+        yyp = self.wrk.multipurpose_multiply(yya, yb, yn, yn_coeff, num_words, dump=d, dump_crt="Y", dump_ladder="Y")
+
+        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, xxp)
+        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, xyp)
+        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, yxp)
+        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, yyp)
+
+        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xxp)
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xyp)
+        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yxp)
+        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yyp)
 
+    #
+    # modular subtract values in sel_narrow_in (X-Y)
+    # stores two copies of the result in sel_*_out
+    #
     def modular_subtract(self, sel_narrow_in, sel_narrow_out, sel_wide_out, num_words):
 
         xa = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
@@ -1128,123 +1196,105 @@ class ModExpNG_Core():
         ya = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
         yb = self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow_in)
         yn = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        
-        xd = self.wrk.subtract(xa, xb, xn, num_words)
-        yd = self.wrk.subtract(ya, yb, yn, num_words)
-        
+
+        xd = self.wrk.serial_subtract_modular(xa, xb, xn, num_words)
+        yd = self.wrk.serial_subtract_modular(ya, yb, yn, num_words)
+
         self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xd)
         self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yd)
 
         self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xd)
         self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yd)
-        
-    def reduce_narrow(self, sel_narrow, num_words):
-        self.wrk.reduce(self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow), num_words)
-        self.wrk.reduce(self.bnk.crt_x.ladder_y.narrow._get_value(sel_narrow), num_words)
-        self.wrk.reduce(self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow), num_words)
-        self.wrk.reduce(self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow), num_words)
+    
+    #
+    # modular reduce sel_narrow_in
+    # stores two copies of the result in sel_*_out
+    #
+    def modular_reduce(self, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words):
+
+        xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+
+        xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
 
+        xb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
+        yb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
+
+        xp = self.wrk.multipurpose_multiply(None, xb, xn, xn_coeff, num_words, reduce_only=True)
+        yp = self.wrk.multipurpose_multiply(None, yb, yn, yn_coeff, num_words, reduce_only=True)
+
+        self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xp)
+        self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, xp)
+        self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yp)
+        self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, yp)
+
+        self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xp)
+        self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, xp)
+        self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yp)
+        self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, yp)
+
+    #
+    # propagate carries in the content of sel_narrow (convert it to non-redundant representation)
+    # overwrites input value
+    #
+    def propagate_carries(self, sel_narrow, num_words):
+        self.wrk.convert_nonredundant(self.bnk.crt_x.ladder_x._get_narrow(sel_narrow), num_words)
+        self.wrk.convert_nonredundant(self.bnk.crt_x.ladder_y._get_narrow(sel_narrow), num_words)
+        self.wrk.convert_nonredundant(self.bnk.crt_y.ladder_x._get_narrow(sel_narrow), num_words)
+        self.wrk.convert_nonredundant(self.bnk.crt_y.ladder_y._get_narrow(sel_narrow), num_words)
+
+    #
+    # copy from CRT_{X,Y}.LADDER_{X,Y}.WIDE.{H,L} to CRT_{X,Y}.LADDER_{X,Y}.NARROW
+    #
     def merge_lha(self, sel_narrow, num_words):
         xx_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
-        xy_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
-        yx_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
-        yy_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
-        
+        xy_lsb = self.bnk.crt_x.ladder_y._get_wide(ModExpNG_WideBankEnum.L)
+        yx_lsb = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
+        yy_lsb = self.bnk.crt_y.ladder_y._get_wide(ModExpNG_WideBankEnum.L)
+
         xx_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
-        xy_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
-        yx_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
-        yy_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
-        
+        xy_msb = self.bnk.crt_x.ladder_y._get_wide(ModExpNG_WideBankEnum.H)
+        yx_msb = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
+        yy_msb = self.bnk.crt_y.ladder_y._get_wide(ModExpNG_WideBankEnum.H)
+
         xx = xx_lsb.words + xx_msb.words
         xy = xy_lsb.words + xy_msb.words
         yx = yx_lsb.words + yx_msb.words
         yy = yy_lsb.words + yy_msb.words
-        
+
         self.bnk.crt_x.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, xx))
         self.bnk.crt_x.ladder_y._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, xy))
         self.bnk.crt_y.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, yx))
         self.bnk.crt_y.ladder_y._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, yy))
 
-    def modular_reduce(self, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words):
-    
-        xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        
-        xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        
-        xb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
-        yb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
-        
-        xp = self.wrk.multiply(None, xb, xn, xn_coeff, num_words, reduce_only=True)
-        yp = self.wrk.multiply(None, yb, yn, yn_coeff, num_words, reduce_only=True)
-        
-        if sel_wide_out is not None:
-            self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xp)
-            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, xp)
-            self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yp)
-            self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, yp)
-        
-        if sel_narrow_out is not None:
-            self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xp)
-            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, xp)
-            self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yp)
-            self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, yp)
-
-    def set_output(self, sel_output, banks_ladder, sel_narrow):
-        self.out._set_value(sel_output, banks_ladder.ladder_x.narrow._get_value(sel_narrow))
-    
-    def move_crt_y2x(self, sel_wide, sel_narrow):
-    
-        if sel_wide is not None:
-            self.bnk.crt_x.ladder_x.wide._set_value(sel_wide, self.bnk.crt_y.ladder_x.wide._get_value(sel_wide))
-            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide, self.bnk.crt_y.ladder_y.wide._get_value(sel_wide))
-
-        if sel_narrow is not None:
-            self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow, self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow))
-            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow, self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow))
-
-    def move_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
-        
-        if sel_wide_out is not None:
-            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in))
-            self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in))
-
-        if sel_narrow_out is not None:
-            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in))
-            self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in))
-
-    def flip_ladder_y2x(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
-        
-        if sel_wide_out is not None:
-            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in))
-
-        if sel_narrow_out is not None:
-            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in))
-
-    def just_multiply(self, sel_wide_in, sel_narrow_in, num_words):
+    #
+    # multiply sel_wide_in by sel_narrow_in
+    # stores the double-width product split into WIDE.L (lower half) and WIDE.H (upper half)
+    #
+    def regular_multiply(self, sel_wide_in, sel_narrow_in, num_words):
 
         xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
         yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        
+
         xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
         yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        
+
         xxa       = self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in)
         xya       = self.bnk.crt_x.ladder_y.wide._get_value(sel_wide_in)
 
         yxa       = self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in)
         yya       = self.bnk.crt_y.ladder_y.wide._get_value(sel_wide_in)
-        
+
         xb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
         yb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
-        
-        xxp = self.wrk.multiply(xxa, xb, None, None, num_words, multiply_only=True)
-        xyp = self.wrk.multiply(xya, xb, None, None, num_words, multiply_only=True)
 
-        yxp = self.wrk.multiply(yxa, yb, None, None, num_words, multiply_only=True)
-        yyp = self.wrk.multiply(yya, yb, None, None, num_words, multiply_only=True)
-        
+        xxp = self.wrk.multipurpose_multiply(xxa, xb, None, None, num_words, multiply_only=True)
+        xyp = self.wrk.multipurpose_multiply(xya, xb, None, None, num_words, multiply_only=True)
+
+        yxp = self.wrk.multipurpose_multiply(yxa, yb, None, None, num_words, multiply_only=True)
+        yyp = self.wrk.multipurpose_multiply(yya, yb, None, None, num_words, multiply_only=True)
+
         xxp_lsb = xxp.lower_half()
         xxp_msb = xxp.upper_half()
 
@@ -1256,7 +1306,7 @@ class ModExpNG_Core():
 
         yyp_lsb = yyp.lower_half()
         yyp_msb = yyp.upper_half()
-                
+
         self.bnk.crt_x.ladder_x.wide._set_value(ModExpNG_WideBankEnum.L, xxp_lsb)
         self.bnk.crt_x.ladder_y.wide._set_value(ModExpNG_WideBankEnum.L, xyp_lsb)
         self.bnk.crt_y.ladder_x.wide._set_value(ModExpNG_WideBankEnum.L, yxp_lsb)
@@ -1266,8 +1316,12 @@ class ModExpNG_Core():
         self.bnk.crt_x.ladder_y.wide._set_value(ModExpNG_WideBankEnum.H, xyp_msb)
         self.bnk.crt_y.ladder_x.wide._set_value(ModExpNG_WideBankEnum.H, yxp_msb)
         self.bnk.crt_y.ladder_y.wide._set_value(ModExpNG_WideBankEnum.H, yyp_msb)
-    
-    def just_add(self, sel_narrow_a_in, sel_narrow_b_in, sel_narrow_out, num_words):
+
+    #
+    # adds sel_narrow_a_in to sel_narrow_b_in
+    # stores result in sel_narrow_out
+    #
+    def regular_add(self, sel_narrow_a_in, sel_narrow_b_in, sel_narrow_out, num_words):
         xxa = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_a_in)
         xya = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_a_in)
         yxa = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_a_in)
@@ -1277,41 +1331,317 @@ class ModExpNG_Core():
         xyb = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_b_in)
         yxb = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_b_in)
         yyb = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_b_in)
-        
-        xxc = self.wrk.add(xxa, xxb, num_words)
-        xyc = self.wrk.add(xya, xyb, num_words)
-        yxc = self.wrk.add(yxa, yxb, num_words)
-        yyc = self.wrk.add(yya, yyb, num_words)
-        
+
+        xxc = self.wrk.serial_add_uneven(xxa, xxb, num_words)
+        xyc = self.wrk.serial_add_uneven(xya, xyb, num_words)
+        yxc = self.wrk.serial_add_uneven(yxa, yxb, num_words)
+        yyc = self.wrk.serial_add_uneven(yya, yyb, num_words)
+
         self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xxc)
         self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xyc)
         self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yxc)
         self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yyc)
-        
-        
+
+    #
+    # dump working variables before ladder step
+    #
+    def dump_before_step_crt(self, pq, m):    # debug dump; pq is printed as num_words, m[0]/m[1] as ladder modes
+        print("num_words = %d" % pq)
+        print("\rladder_mode_x = %d" % m[0])
+        print("\rladder_mode_y = %d" % m[1])
+        self.bnk.crt_x.ladder_x._get_narrow(N.C).format_verilog_concat("X_X")
+        self.bnk.crt_x.ladder_y._get_narrow(N.C).format_verilog_concat("X_Y")
+        self.bnk.crt_y.ladder_x._get_narrow(N.C).format_verilog_concat("Y_X")
+        self.bnk.crt_y.ladder_y._get_narrow(N.C).format_verilog_concat("Y_Y")
+        self.bnk.crt_x.ladder_x._get_wide(W.N).format_verilog_concat("X_N")
+        self.bnk.crt_y.ladder_x._get_wide(W.N).format_verilog_concat("Y_N")    # Y_* must be read from the CRT_Y bank
+        self.bnk.crt_x.ladder_x._get_narrow(N.N_COEFF).format_verilog_concat("X_N_COEFF")
+        self.bnk.crt_y.ladder_x._get_narrow(N.N_COEFF).format_verilog_concat("Y_N_COEFF")    # likewise CRT_Y's own coefficient
+
+    #
+    # dump working variables after ladder step
+    #
+    def dump_after_step_crt(self):
+        self.bnk.crt_x.ladder_x._get_narrow(N.C).format_verilog_concat("X_X")
+        self.bnk.crt_x.ladder_y._get_narrow(N.C).format_verilog_concat("X_Y")
+        self.bnk.crt_y.ladder_x._get_narrow(N.C).format_verilog_concat("Y_X")
+        self.bnk.crt_y.ladder_y._get_narrow(N.C).format_verilog_concat("Y_Y")
+
+    #
+    # this deliberately converts narrow operand into redundant representation
+    #
+    def _force_overflow(self, bank_crt, sel_narrow):
+
+        # original words
+        T = bank_crt.ladder_x._get_narrow(sel_narrow).words
+
+        # loop through upper N-1 words
+        for i in range(1, len(T)):
+
+            # get msbs of the previous word
+            upper_bits = T[i-1] & _CARRY_MASK
+
+            # if the previous msbs are empty, force lsbs of the current word
+            # into them and then wipe the current lsbs
+            if upper_bits == 0:
+                lower_bits = T[i] & (_CARRY_MASK >> _WORD_WIDTH)
+                T[i] ^= lower_bits
+                T[i-1] |= (lower_bits << _WORD_WIDTH)
+
+        # overwrite original words
+        bank_crt.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, len(T), T))
+
+        print("Forced overflow.")
+
+#
+# read content of core's output bank and compare it against known good values
+#
+def compare_signature():
+
+    c  = core
+    s  = s_known
+    xm = xm_known
+    ym = ym_known
+
+    core_s  = c.out.get_value(O.S)
+    core_xm = c.out.get_value(O.XM)
+    core_ym = c.out.get_value(O.YM)
+
+    if core_s.number()  != s:  print("ERROR: core_s != s!")
+    else:                      print("s is OK")
+
+    if core_xm.number() != xm: print("ERROR: core_xm != xm!")
+    else:                      print("x_mutated is OK")
+
+    if core_ym.number() != ym: print("ERROR: core_ym != ym!")
+    else:                      print("y_mutated is OK")
+
+#
+# get current ladder mode based on two exponents' bits
+#
+def get_ladder_mode_using_crt(v, bit):
+
+    bit_value_p = (v.dp.number() & (1 << bit)) >> bit
+    bit_value_q = (v.dq.number() & (1 << bit)) >> bit
+
+    bit_value_p = bit_value_p > 0
+    bit_value_q = bit_value_q > 0
+
+    return (bit_value_p, bit_value_q)
+
+#
+# print current exponentiation progress
+#
+def print_ladder_progress(current, total):
+
+    # this will always print "100.0%" at the very last iteration, since we're
+    # counting bits from msb to lsb and the very last index is zero, which
+    # is congruent to 0 mod DUMP_PROGRESS_FACTOR
+    if (current % DUMP_PROGRESS_FACTOR) == 0:
+        pct = float((_WORD_WIDTH * total - current) / (_WORD_WIDTH * total)) * 100.0
+        print("\rdone: %5.1f%%" % pct, end='')
+
+    # move to next line after the very last iteration
+    if current == 0: print("")
+
+#
+# try to exponentiate using the quad-multiplier (dual-core, dual-ladder) scheme
+#
+def sign_using_crt():
+
+    c  = core
+    v  = vector
+    n  = n_num_words
+    pq = pq_num_words
+
+    ff = (False, False)
+                                                                   #
+                                                                   # A / B => different content in banks (A in WIDE, B in NARROW)
+                                                                   # [XY]Z => different content in ladders (XZ in X, YZ in Y)
+                                                                   # ..    => temporarily half-filled bank (omitted to save space)
+                                                                   # *     => "crossed" content (X.Y == Y.X and Y.Y == X.X)
+                                                                   #
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+                                                                   # |  A                     |  B    |  C               |  D      |  E        |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_wide_from_input   (c.bnk.crt_x, W.N,       I.N)          # |  ?                     |  ?    |  ?               |  ?      | ?         |
+    c.set_wide_from_input   (c.bnk.crt_y, W.N,       I.N)          # |  ?                     |  ?    |  ?               |  ?      | ?         |
+    c.set_wide_from_input   (c.bnk.crt_x, W.A,       I.X)          # |  ..                    |  ?    |  ?               |  ?      | ?         |
+    c.set_wide_from_input   (c.bnk.crt_y, W.A,       I.Y)          # | [XY] / ?               |  ?    |  ?               |  ?      | ?         |
+    c.set_wide_from_input   (c.bnk.crt_x, W.E,       I.M)          # | [XY] / ?               |  ?    |  ?               |  ?      | .. / ?    |
+    c.set_wide_from_input   (c.bnk.crt_y, W.E,       I.M)          # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_narrow_from_input (c.bnk.crt_x, N.N_COEFF, I.N_COEFF)    # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
+    c.set_narrow_from_input (c.bnk.crt_y, N.N_COEFF, I.N_COEFF)    # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
+    c.set_narrow_from_input (c.bnk.crt_x, N.A,       I.N_FACTOR)   # | [XY] / ..              |  ?    |  ?               |  ?      | M  / ?    |
+    c.set_narrow_from_input (c.bnk.crt_y, N.A,       I.N_FACTOR)   # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M  / ?    |
+    c.set_narrow_from_input (c.bnk.crt_x, N.E,       I.M)          # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M  / ..   |
+    c.set_narrow_from_input (c.bnk.crt_y, N.E,       I.M)          # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.A, N.A, W.B, N.B, n)                      # | [XY] / N_FACTOR        | [XY]F |  ?               |  ?      | M         | [XY]F = [XY] * N_FACTOR
+    c.modular_multiply(W.B, N.B, W.C, N.C, n, mode=ff)             # | [XY] / N_FACTOR        | [XY]F | [XY]MF           |  ?      | M         | [XY]MF = [XY]F * [XY]F
+    c.modular_multiply(W.C, N.I, W.D, N.D, n)                      # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         | [XY]M = [XY]MF * 1
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.propagate_carries(N.D, n_num_words)                          # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_output_from_narrow(O.XM, c.bnk.crt_x, N.D)               # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         |
+    c.set_output_from_narrow(O.YM, c.bnk.crt_y, N.D)               # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.E, N.B, W.C, N.C, n)                      # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         | [XY]MB = M*[XY]F
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.propagate_carries(N.C, n_num_words)                          # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.copy_crt_y2x(W.C, N.C)                                       # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_wide_from_input  (c.bnk.crt_x, W.N,       I.P)           # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
+    c.set_wide_from_input  (c.bnk.crt_y, W.N,       I.Q)           # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
+    c.set_wide_from_input  (c.bnk.crt_x, W.A,       I.P_FACTOR)    # | ...         / N_FACTOR | [XY]F |  YMB             | [XY]M   | M         |
+    c.set_wide_from_input  (c.bnk.crt_y, W.A,       I.Q_FACTOR)    # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | M         |
+    c.set_wide_from_input  (c.bnk.crt_x, W.E,       I.QINV)        # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | ..        |
+    c.set_wide_from_input  (c.bnk.crt_x, W.E,       I.QINV)        # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_narrow_from_input(c.bnk.crt_x, N.N_COEFF, I.P_COEFF)     # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
+    c.set_narrow_from_input(c.bnk.crt_y, N.N_COEFF, I.Q_COEFF)     # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
+    c.set_narrow_from_input(c.bnk.crt_x, N.A,       I.P_FACTOR)    # | [PQ]_FACTOR / ...      | [XY]F |  YMB             | [XY]M   | QINV / M  |
+    c.set_narrow_from_input(c.bnk.crt_y, N.A,       I.Q_FACTOR)    # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV / M  |
+    c.set_narrow_from_input(c.bnk.crt_x, N.E,       I.QINV)        # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV / .. |
+    c.set_narrow_from_input(c.bnk.crt_x, N.E,       I.QINV)        # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV      |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_reduce(N.C, W.D, N.D, pq)                            # | [PQ]_FACTOR            | [XY]F |  YMB             | [PQ]MBZ | QINV      | [PQ]MBZ = YMB mod [PQ]
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.D, N.A, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]MB           | [PQ]MBZ | QINV      | [PQ]MB = [PQ]MBZ * [PQ]_FACTOR
+    c.modular_multiply(W.C, N.A, W.D, N.D, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]MB           | [PQ]MBF | QINV      | [PQ]MBF = [PQ]MB * [PQ]_FACTOR
+    c.modular_multiply(W.A, N.I, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]IF           | [PQ]MBF | QINV      | [PQ]IF = 1 * [PQ]_FACTOR
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.copy_ladders_x2y(W.D, N.D, W.C, N.C)                         # | [PQ]_FACTOR            | [XY]F | [PQ]IF / [PQ]MBF | [PQ]MBF | QINV      |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    ###########################                                    # |                        |       |                  |         |           |
+    # Begin Montgomery Ladder #                                    # |                        |       |                  |         |           |
+    ###########################                                    # |                        |       |                  |         |           |
+                                                                   # |                        |       |                  |         |           |
+    for bit in range(_WORD_WIDTH * pq - 1, -1, -1):                # |                        |       |                  |         |           |
+                                                                   # |                        |       |                  |         |           |
+        m  = get_ladder_mode_using_crt(v, bit)                     # |                        |       |                  |         |           |
+        dbg = bit == DUMP_LADDER_INDEX                             # |                        |       |                  |         |           |
+                                                                   # |                        |       |                  |         |           |
+        if dbg:                                                    # |                        |       |                  |         |           |
+            if FORCE_OVERFLOW: c._force_overflow(c.bnk.crt_x, N.C) # |                        |       |                  |         |           |
+            if DUMP_VECTORS: c.dump_before_step_crt(pq, m)         # |                        |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+        c.modular_multiply(W.C, N.C, W.C, N.C, pq, mode=m, d=dbg)  # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]MBF | QINV      | <LADDER>
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+        if dbg and DUMP_VECTORS: c.dump_after_step_crt()           # |                        |       |                  |         |           |
+        print_ladder_progress(bit, pq)                             # |                        |       |                  |         |           |
+                                                                   # |                        |       |                  |         |           |
+    #########################                                      # |                        |       |                  |         |           |
+    # End Montgomery Ladder #                                      # |                        |       |                  |         |           |
+    #########################                                      # |                        |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.C, N.I, W.D, N.D, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB  | QINV      | [PQ]SB = [PQ]SBF * 1
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.propagate_carries(N.D, pq)                                   # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB  | QINV      |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.cross_ladders_x2y(W.D, N.D, W.D, N.D)                        # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB* | QINV      |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_subtract(N.D, N.C, W.C, pq)                          # | [PQ]_FACTOR            | [XY]F |  RSB             | [PQ]SB* | QINV      | RSB = PSB - QSB
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.C, N.E, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F |  RSBIZ           | [PQ]SB* | QINV      | RSBIZ = RSB * QINV
+    c.modular_multiply(W.C, N.A, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | QINV      | RSBI = RSBIZ * P_FACTOR
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_wide_from_input  (c.bnk.crt_x, W.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | ..        |
+    c.set_wide_from_input  (c.bnk.crt_x, W.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q / QINV  |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_narrow_from_input(c.bnk.crt_x, N.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q / ..    |
+    c.set_narrow_from_input(c.bnk.crt_x, N.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.regular_multiply(W.E, N.C, pq)                               # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q         | = RSBI * Q
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.merge_lha(N.A, pq)                                           # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            | [PQ]SB* | Q         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.propagate_carries(N.A, n)                                    # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            | [PQ]SB* | Q         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.copy_crt_y2x(W.D, N.D)                                       # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            |  QSB*   | Q         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.regular_add(N.D, N.A, N.C, pq)                               # | [PQ]_FACTOR / QRSBI    | [XY]F |  SB              |  QSB*   | Q         | SB = QSB + RSBI
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_wide_from_input  (c.bnk.crt_x, W.N, I.N)                 # |                        |       |                  |         |           |
+    c.set_wide_from_input  (c.bnk.crt_y, W.N, I.N)                 # |                        |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_narrow_from_input(c.bnk.crt_x, N.N_COEFF, I.N_COEFF)     # |                        |       |                  |         |           |
+    c.set_narrow_from_input(c.bnk.crt_y, N.N_COEFF, I.N_COEFF)     # |                        |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.B, N.C, W.A, N.A, n, ff)                  # |  S                     |       |                  |         |           | S = XF * SB
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.propagate_carries(N.A, n)                                    # |  S                     |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_output_from_narrow(O.S, c.bnk.crt_x, N.A)                # |  S                     |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+#
+# main()
+#
 if __name__ == "__main__":
 
+    # handy shortcuts
+    W = ModExpNG_WideBankEnum
+    N = ModExpNG_NarrowBankEnum
+    I = ModExpNG_CoreInputEnum
+    O = ModExpNG_CoreOutputEnum
+
+    # set helper quantity
+    # instantiate core
     # load test vector
-    # create worker
+    # transfer numbers from vector to core
     # set numbers of words
     # obtain known good reference value with built-in math
-    # create helper quantity
     # mutate blinding quantities with built-in math
 
-    n_num_words  = KEY_LENGTH  // _WORD_WIDTH
-    pq_num_words = n_num_words // 2
-
     i = ModExpNG_Operand(1, KEY_LENGTH)
 
-    vector = ModExpNG_TestVector()
     core   = ModExpNG_Core(i)
-    
+    vector = ModExpNG_TestVector()
+
+    core.inp.set_value(I.M,        vector.m)
+
+    core.inp.set_value(I.N,        vector.n)
+    core.inp.set_value(I.P,        vector.p)
+    core.inp.set_value(I.Q,        vector.q)
+
+    core.inp.set_value(I.N_COEFF,  vector.n_coeff)
+    core.inp.set_value(I.P_COEFF,  vector.p_coeff)
+    core.inp.set_value(I.Q_COEFF,  vector.q_coeff)
+
+    core.inp.set_value(I.N_FACTOR, vector.n_factor)
+    core.inp.set_value(I.P_FACTOR, vector.p_factor)
+    core.inp.set_value(I.Q_FACTOR, vector.q_factor)
+
+    core.inp.set_value(I.X,        vector.x)
+    core.inp.set_value(I.Y,        vector.y)
+
+    core.inp.set_value(I.QINV,     vector.qinv)
+
+    n_num_words  = KEY_LENGTH  // _WORD_WIDTH
+    pq_num_words = n_num_words // 2
+
     s_known = pow(vector.m.number(), vector.d.number(), vector.n.number())
 
-    x_mutated_known = pow(vector.x.number(), 2, vector.n.number())
-    y_mutated_known = pow(vector.y.number(), 2, vector.n.number())
+    xm_known = pow(vector.x.number(), 2, vector.n.number())
+    ym_known = pow(vector.y.number(), 2, vector.n.number())
+
+    # sign using CRT
+    print("Signing using CRT...")
+    sign_using_crt()
+    compare_signature()
+
+    # sign without CRT
+    # ...
+
+
+#
+# End-of-File
+#
+
+
 
-    
     # bring one into Montgomery domain (glue 2**r to one)
     # bring blinding coefficients into Montgomery domain (glue 2**(2*r) to x and y)
     # blind message
@@ -1329,129 +1659,3 @@ if __name__ == "__main__":
     # unblind s
     # mutate blinding factors
 
-    W = ModExpNG_WideBankEnum
-    N = ModExpNG_NarrowBankEnum
-    O = ModExpNG_CoreOutputEnum
-    
-    core.bnk.crt_x.set_modulus(vector.n, vector.n_coeff)
-    core.bnk.crt_y.set_modulus(vector.n, vector.n_coeff)
-    
-    core.bnk.crt_x.set_operands_crt_xy(W.A, N.A, vector.x, vector.n_factor)
-    core.bnk.crt_y.set_operands_crt_xy(W.A, N.A, vector.y, vector.n_factor)
-
-    core.bnk.crt_x.set_operands_crt_xy(W.E, N.E, vector.m, vector.m)
-    core.bnk.crt_y.set_operands_crt_xy(W.E, N.E, vector.m, vector.m)
-                                                                                   #                             | A               | B     | C                | D       | E |
-                                                                                   #                             +-----------------+-------+------------------+---------+---+
-                                                                                   #                             | [XY] ; N_FACTOR | ?     | ?                | ?       | M |
-    core.modular_multiply(W.A, N.A, W.B, N.B, n_num_words)                         # [XY]F  =[XY]*N_FACTOR       | [XY] ; N_FACTOR | [XY]F | ?                | ?       | M |
-    core.modular_multiply(W.B, N.B, W.C, N.C, n_num_words, mode=(False, False))    # [XY]MF =[XY]F*[XY]F         | [XY] ; N_FACTOR | [XY]F | [XY]YM           | ?       | M |
-    core.modular_multiply(W.C, N.I, W.D, N.D, n_num_words)                         # [XY]M  =[XY]MF*1            | [XY] ; N_FACTOR | [XY]F | [XY]YM           | [XY]M   | M |
-    core.reduce_narrow(N.D, n_num_words)                                           #                             |                 |       |                  |         |   |
-    core.set_output(O.XM, core.bnk.crt_x, N.D)                                     #                             |                 |       |                  |         |   |
-    core.set_output(O.YM, core.bnk.crt_y, N.D)                                     #                             |                 |       |                  |         |   |
-    core.modular_multiply(W.E, N.B, W.C, N.C, n_num_words)                         # [XY]MB =M*[XY]F             | [XY] ; N_FACTOR | [XY]F | [XY]MB           | [XY]M   | M |
-    core.move_crt_y2x(W.C, N.C)                                                    #                             | [XY] ; N_FACTOR | [XY]F | YMB              | [XY]M   | M |
-    core.bnk.crt_x.set_modulus(vector.p, vector.p_coeff)                           #                             |                 |       |                  |         |   |
-    core.bnk.crt_y.set_modulus(vector.q, vector.q_coeff)                           #                             |                 |       |                  |         |   |
-    core.bnk.crt_x.set_operands_crt_xy(W.A, N.A, vector.p_factor, vector.p_factor) #                             | [PQ]_FACTOR     | [XY]F | YMB              | [XY]M   | M |
-    core.bnk.crt_y.set_operands_crt_xy(W.A, N.A, vector.q_factor, vector.q_factor) #                             | [PQ]_FACTOR     | [XY]F | YMB              | [XY]M   | M |
-    core.reduce_narrow(N.C, n_num_words)                                           #                             |                 |       |                  |         |   |
-    core.modular_reduce(N.C, W.D, N.D, pq_num_words)                               #                             | [PQ]_FACTOR     | [XY]F | YMB              | [PQ]MBZ | M |
-    core.modular_multiply(W.D, N.A, W.C, N.C, pq_num_words)                        # [PQ]MB =[PQ]MBZ*[PQ]_FACTOR | [PQ]_FACTOR     | [XY]F | [PQ]MB           | [PQ]MBZ | M |
-    core.modular_multiply(W.C, N.A, W.D, N.D, pq_num_words)                        # [PQ]MBF=[PQ]MB*[PQ]_FACTOR  | [PQ]_FACTOR     | [XY]F | [PQ]MB           | [PQ]MBF | M |
-    core.modular_multiply(W.A, N.I, W.C, N.C, pq_num_words)                        # [PQ]MBF=[PQ]MB*[PQ]_FACTOR  | [PQ]_FACTOR     | [XY]F | [PQ]IF           | [PQ]MBF | M |
-    core.move_ladders_x2y(W.D, N.D, W.C, N.C)                                      #                             | [PQ]_FACTOR     | [XY]F | [PQ]IF / [PQ]MBF | [PQ]MBF | M |
-    
-    #PIF = core.bnk.crt_x.ladder_x.narrow._get_value(N.C)#
-    #QIF = core.bnk.crt_y.ladder_x.narrow._get_value(N.C)#
-    
-
-
-
-########################
-
-    for bit in range(_WORD_WIDTH * pq_num_words - 1, -1, -1):
-    
-        bit_value_p = (vector.dp.number() & (1 << bit)) >> bit
-        bit_value_q = (vector.dq.number() & (1 << bit)) >> bit
-
-        bit_value_p = bit_value_p > 0
-        bit_value_q = bit_value_q > 0
-            
-        # mode = ... (shorted the next line for better readability)
-            
-        core.modular_multiply(W.C, N.C, W.C, N.C, pq_num_words, mode=(bit_value_p, bit_value_q))    # <LADDER>   | [PQ]_FACTOR     | [XY]F | [PQ]SBF          | [PQ]MBF | M |
-        
-        if (bit % 4) == 0:
-            pct = float((_WORD_WIDTH * pq_num_words - bit) / (_WORD_WIDTH * pq_num_words)) * 100.0
-            print("\rdone: %5.1f%%" % pct, end='')
-        
-    print("")
-
-    core.modular_multiply(W.C, N.I, W.D, N.D, pq_num_words)                        # [PQ]SB=[PQ]SBF*1            | [PQ]_FACTOR     | [XY]F | [PQ]SBF          | [PQ]SB  | M |
-
-############################
-    
-    core.reduce_narrow(N.D, pq_num_words)
-    
-    #SQB = core.bnk.crt_y.ladder_x.narrow._get_value(N.D)
-    
-    core.flip_ladder_y2x(W.D, N.D, W.D, N.D)
-    core.modular_subtract(N.D, N.C, W.C, pq_num_words)                     #                             | [PQ]_FACTOR     | [XY]F | RSB              | [PQ]SB  | M    |
-    core.bnk.crt_x.set_operands_crt_xy(W.E, N.E, vector.qinv, vector.qinv) #                             | [PQ]_FACTOR     | [XY]F | RSB              | [PQ]SB  | QINV |
-    core.bnk.crt_y.set_operands_crt_xy(W.E, N.E, vector.qinv, vector.qinv) #                             | [PQ]_FACTOR     | [XY]F | RSB              | [PQ]SB  | QINV |
-
-    core.modular_multiply(W.C, N.E, W.C, N.C, pq_num_words)                #                             | [PQ]_FACTOR     | [XY]F | RSBIZ            | [PQ]SB  | QINV |
-    core.modular_multiply(W.C, N.A, W.C, N.C, pq_num_words)                #                             | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | QINV |
-    
-    core.bnk.crt_x.set_operands_crt_xy(W.E, N.E, vector.q, vector.q) #                             | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | Q |
-    core.bnk.crt_y.set_operands_crt_xy(W.E, N.E, vector.q, vector.q) #                             | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | Q |
-    
-    core.just_multiply(W.E, N.C, pq_num_words)                            # | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | Q |
-
-    core.merge_lha(N.A, pq_num_words)
-    
-    core.reduce_narrow(N.A, n_num_words)
-
-    core.move_crt_y2x(W.D, N.D)
-    
-    #RQSBI = core.bnk.crt_x.ladder_x.narrow._get_value(N.A)
-    
-    core.just_add(N.D, N.A, N.C, pq_num_words)   # 
-    SB = core.bnk.crt_x.ladder_x._get_narrow(N.C)
-    #print(hex(SB.number()))
-
-    #SB = core.wrk.add(SQB, RQSBI, pq_num_words) # just_add
-    #print(hex(SB.number()))
-    
-    
-    # check why multiplication is not commutative!?
-    
-    
-    XF = core.bnk.crt_x.ladder_x.wide._get_value(W.B)
-
-    core.bnk.crt_x.set_modulus(vector.n, vector.n_coeff)
-    core.bnk.crt_y.set_modulus(vector.n, vector.n_coeff)
-
-    core.modular_multiply(W.B, N.C, W.A, N.A, n_num_words, mode=(False, False))
-    core.reduce_narrow(N.A, n_num_words)    
-    core.set_output(O.S, core.bnk.crt_x, N.A)
-    
-    S  = core.out.get_value(O.S)
-    XM = core.out.get_value(O.XM)
-    YM = core.out.get_value(O.YM)
-    
-    if S.number() != s_known: print("ERROR: s_crt_unblinded != s_known!")
-    else:                     print("s is OK")
-
-    if XM.number() != x_mutated_known: print("ERROR: x_mutated != x_mutated_known!")
-    else:                              print("x_mutated is OK")
-
-    if YM.number() != y_mutated_known: print("ERROR: y_mutated != y_mutated_known!")
-    else:                              print("y_mutated is OK")
-
-
-#
-# End-of-File
-#


From git at cryptech.is  Mon Aug 19 11:07:14 2019
From: git at cryptech.is (git at cryptech.is)
Date: Mon, 19 Aug 2019 11:07:14 +0000
Subject: [Cryptech-Commits] [user/shatov/modexpng] 12/12: * More cleanup
 (got rid of .wide. and .narrow.)
In-Reply-To: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
References: <156621282292.96240.10854263252293302403@bikeshed.cryptech.is>
Message-ID: <20190819110713.DB33B991D76@bikeshed.cryptech.is>

This is an automated email from the git hooks/post-receive script.

meisterpaul1 at yandex.ru pushed a commit to branch master
in repository user/shatov/modexpng.

commit 0beee226e63b3a62ba32bc588e40eaeef01eac2b
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Mon Aug 19 14:04:27 2019 +0300

    * More cleanup (got rid of .wide. and .narrow.)
    
    * Working microcode for non-CRT exponentiation (i.e. when only d is known)
---
 modexpng_fpga_model.py | 254 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 169 insertions(+), 85 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index 71a4b91..325f544 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -1085,7 +1085,7 @@ class ModExpNG_Core():
     # copy from CRT_(X|Y).LADDER_X.NARROW to OUTPUT
     #
     def set_output_from_narrow(self, sel_output, bank_crt, sel_narrow):
-        self.out._set_value(sel_output, bank_crt.ladder_x.narrow._get_value(sel_narrow))
+        self.out._set_value(sel_output, bank_crt.ladder_x._get_narrow(sel_narrow))
 
     #
     # copy from INPUT to CRT_(X|Y).LADDER_{X,Y}.NARROW
@@ -1102,7 +1102,7 @@ class ModExpNG_Core():
         bank_crt.ladder_y._set_wide(sel_wide, self.inp._get_value(sel_input))
 
     #
-    # copy from CRT_Y.LADDER_{X,Y).{WIDE,NARROW} to CRT_X.LADDER_{X,Y}.{WIDE,NARROW}
+    # copy from CRT_Y.LADDER_{X,Y}.{WIDE,NARROW} to CRT_X.LADDER_{X,Y}.{WIDE,NARROW}
     #
     def copy_crt_y2x(self, sel_wide, sel_narrow):
 
@@ -1123,6 +1123,17 @@ class ModExpNG_Core():
         self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in))
         self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in))
 
+    #
+    # copy from CRT_{X,Y}.LADDER_Y.{WIDE,NARROW} to CRT_{X,Y}.LADDER_X.{WIDE,NARROW}
+    #
+    def copy_ladders_y2x(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
+
+        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, self.bnk.crt_x.ladder_y._get_wide(sel_wide_in))
+        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, self.bnk.crt_y.ladder_y._get_wide(sel_wide_in))
+
+        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_in))
+        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_in))
+
     #
     # copy from CRT_{X,Y}.LADDER_X.{WIDE,NARROW} to CRT_{Y,X}.LADDER_Y.{WIDE,NARROW}
     #
@@ -1143,23 +1154,23 @@ class ModExpNG_Core():
     #
     def modular_multiply(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words, mode=(True, True), d=False):
 
-        xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        xn       = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
+        yn       = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
 
-        xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        xn_coeff = self.bnk.crt_x.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
+        yn_coeff = self.bnk.crt_y.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
 
-        xxa       = self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in)
-        xya       = self.bnk.crt_x.ladder_y.wide._get_value(sel_wide_in)
+        xxa       = self.bnk.crt_x.ladder_x._get_wide(sel_wide_in)
+        xya       = self.bnk.crt_x.ladder_y._get_wide(sel_wide_in)
 
-        yxa       = self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in)
-        yya       = self.bnk.crt_y.ladder_y.wide._get_value(sel_wide_in)
+        yxa       = self.bnk.crt_y.ladder_x._get_wide(sel_wide_in)
+        yya       = self.bnk.crt_y.ladder_y._get_wide(sel_wide_in)
 
-        xxb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
-        xyb       = self.bnk.crt_x.ladder_y.narrow._get_value(sel_narrow_in)
+        xxb       = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
+        xyb       = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_in)
 
-        yxb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
-        yyb       = self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow_in)
+        yxb       = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)
+        yyb       = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_in)
 
         if not mode[0]: xb = xxb
         else:           xb = xyb
@@ -1189,22 +1200,22 @@ class ModExpNG_Core():
     #
     def modular_subtract(self, sel_narrow_in, sel_narrow_out, sel_wide_out, num_words):
 
-        xa = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
-        xb = self.bnk.crt_x.ladder_y.narrow._get_value(sel_narrow_in)
-        xn = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        xa = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
+        xb = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_in)
+        xn = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
 
-        ya = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
-        yb = self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow_in)
-        yn = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        ya = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)
+        yb = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_in)
+        yn = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
 
         xd = self.wrk.serial_subtract_modular(xa, xb, xn, num_words)
         yd = self.wrk.serial_subtract_modular(ya, yb, yn, num_words)
 
-        self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xd)
-        self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yd)
+        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xd)
+        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yd)
 
-        self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xd)
-        self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yd)
+        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, xd)
+        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, yd)
     
     #
     # modular reduce sel_narrow_in
@@ -1212,27 +1223,27 @@ class ModExpNG_Core():
     #
     def modular_reduce(self, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words):
 
-        xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        xn       = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
+        yn       = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
 
-        xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        xn_coeff = self.bnk.crt_x.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
+        yn_coeff = self.bnk.crt_y.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
 
-        xb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
-        yb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
+        xb       = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
+        yb       = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)
 
         xp = self.wrk.multipurpose_multiply(None, xb, xn, xn_coeff, num_words, reduce_only=True)
         yp = self.wrk.multipurpose_multiply(None, yb, yn, yn_coeff, num_words, reduce_only=True)
 
-        self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xp)
-        self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, xp)
-        self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yp)
-        self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, yp)
+        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, xp)
+        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, xp)
+        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, yp)
+        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, yp)
 
-        self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xp)
-        self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, xp)
-        self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yp)
-        self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, yp)
+        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xp)
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xp)
+        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yp)
+        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yp)
 
     #
     # propagate carries (convert to non-redundant representation) content in sel_narrow
@@ -1274,20 +1285,20 @@ class ModExpNG_Core():
     #
     def regular_multiply(self, sel_wide_in, sel_narrow_in, num_words):
 
-        xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        xn       = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
+        yn       = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
 
-        xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        xn_coeff = self.bnk.crt_x.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
+        yn_coeff = self.bnk.crt_y.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
 
-        xxa       = self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in)
-        xya       = self.bnk.crt_x.ladder_y.wide._get_value(sel_wide_in)
+        xxa       = self.bnk.crt_x.ladder_x._get_wide(sel_wide_in)
+        xya       = self.bnk.crt_x.ladder_y._get_wide(sel_wide_in)
 
-        yxa       = self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in)
-        yya       = self.bnk.crt_y.ladder_y.wide._get_value(sel_wide_in)
+        yxa       = self.bnk.crt_y.ladder_x._get_wide(sel_wide_in)
+        yya       = self.bnk.crt_y.ladder_y._get_wide(sel_wide_in)
 
-        xb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
-        yb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
+        xb       = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
+        yb       = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)
 
         xxp = self.wrk.multipurpose_multiply(xxa, xb, None, None, num_words, multiply_only=True)
         xyp = self.wrk.multipurpose_multiply(xya, xb, None, None, num_words, multiply_only=True)
@@ -1307,15 +1318,15 @@ class ModExpNG_Core():
         yyp_lsb = yyp.lower_half()
         yyp_msb = yyp.upper_half()
 
-        self.bnk.crt_x.ladder_x.wide._set_value(ModExpNG_WideBankEnum.L, xxp_lsb)
-        self.bnk.crt_x.ladder_y.wide._set_value(ModExpNG_WideBankEnum.L, xyp_lsb)
-        self.bnk.crt_y.ladder_x.wide._set_value(ModExpNG_WideBankEnum.L, yxp_lsb)
-        self.bnk.crt_y.ladder_y.wide._set_value(ModExpNG_WideBankEnum.L, yyp_lsb)
+        self.bnk.crt_x.ladder_x._set_wide(ModExpNG_WideBankEnum.L, xxp_lsb)
+        self.bnk.crt_x.ladder_y._set_wide(ModExpNG_WideBankEnum.L, xyp_lsb)
+        self.bnk.crt_y.ladder_x._set_wide(ModExpNG_WideBankEnum.L, yxp_lsb)
+        self.bnk.crt_y.ladder_y._set_wide(ModExpNG_WideBankEnum.L, yyp_lsb)
 
-        self.bnk.crt_x.ladder_x.wide._set_value(ModExpNG_WideBankEnum.H, xxp_msb)
-        self.bnk.crt_x.ladder_y.wide._set_value(ModExpNG_WideBankEnum.H, xyp_msb)
-        self.bnk.crt_y.ladder_x.wide._set_value(ModExpNG_WideBankEnum.H, yxp_msb)
-        self.bnk.crt_y.ladder_y.wide._set_value(ModExpNG_WideBankEnum.H, yyp_msb)
+        self.bnk.crt_x.ladder_x._set_wide(ModExpNG_WideBankEnum.H, xxp_msb)
+        self.bnk.crt_x.ladder_y._set_wide(ModExpNG_WideBankEnum.H, xyp_msb)
+        self.bnk.crt_y.ladder_x._set_wide(ModExpNG_WideBankEnum.H, yxp_msb)
+        self.bnk.crt_y.ladder_y._set_wide(ModExpNG_WideBankEnum.H, yyp_msb)
 
     #
     # adds sel_narrow_a_in to sel_narrow_b_in
@@ -1345,7 +1356,7 @@ class ModExpNG_Core():
     #
     # dump working variables before ladder step
     #
-    def dump_before_step_crt(self, pq, m):
+    def dump_before_step_using_crt(self, pq, m):
         print("num_words = %d" % pq)
         print("\rladder_mode_x = %d" % m[0])
         print("\rladder_mode_y = %d" % m[1])
@@ -1361,7 +1372,7 @@ class ModExpNG_Core():
     #
     # dump working variables after ladder step
     #
-    def dump_after_step_crt(self):
+    def dump_after_step_using_crt(self):
         self.bnk.crt_x.ladder_x._get_narrow(N.C).format_verilog_concat("X_X")
         self.bnk.crt_x.ladder_y._get_narrow(N.C).format_verilog_concat("X_Y")
         self.bnk.crt_y.ladder_x._get_narrow(N.C).format_verilog_concat("Y_X")
@@ -1429,6 +1440,17 @@ def get_ladder_mode_using_crt(v, bit):
 
     return (bit_value_p, bit_value_q)
 
+#
+# get current ladder mode based on private exponent's bit
+#
+def get_ladder_mode_without_crt(v, bit):
+
+    bit_value_d = (v.d.number() & (1 << bit)) >> bit
+
+    bit_value_d = bit_value_d > 0
+
+    return (not bit_value_d, bit_value_d)
+
 #
 # print current exponentiation progress
 #
@@ -1487,7 +1509,7 @@ def sign_using_crt():
     c.set_output_from_narrow(O.XM, c.bnk.crt_x, N.D)               # | [XY] / N_FACTOR        | [XY]F | [XY]YM           | [XY]M   | M         |
     c.set_output_from_narrow(O.YM, c.bnk.crt_y, N.D)               # | [XY] / N_FACTOR        | [XY]F | [XY]YM           | [XY]M   | M         |
                                                                    # +------------------------+-------+------------------+---------+-----------+
-    c.modular_multiply(W.E, N.B, W.C, N.C, n)                      # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         | [XY]MB = M*[XY]F
+    c.modular_multiply(W.E, N.B, W.C, N.C, n)                      # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         | [XY]MB = M * [XY]F
                                                                    # +------------------------+-------+------------------+---------+-----------+
     c.propagate_carries(N.C, n_num_words)                          # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         |
                                                                    # +------------------------+-------+------------------+---------+-----------+
@@ -1508,7 +1530,6 @@ def sign_using_crt():
     c.set_narrow_from_input(c.bnk.crt_x, N.E,       I.QINV)        # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV      |
                                                                    # +------------------------+-------+------------------+---------+-----------+
     c.modular_reduce(N.C, W.D, N.D, pq)                            # | [PQ]_FACTOR            | [XY]F |  YMB             | [PQ]MBZ | QINV      | [PQ]MBZ = YMB mod [PQ]
-                                                                   # +------------------------+-------+------------------+---------+-----------+
     c.modular_multiply(W.D, N.A, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]MB           | [PQ]MBZ | QINV      | [PQ]MB = [PQ]MBZ * [PQ]_FACTOR
     c.modular_multiply(W.C, N.A, W.D, N.D, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]MB           | [PQ]MBF | QINV      | [PQ]MBF = [PQ]MB * [PQ]_FACTOR
     c.modular_multiply(W.A, N.I, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]IF           | [PQ]MBF | QINV      | [PQ]IF = 1 * [PQ]_FACTOR
@@ -1526,11 +1547,11 @@ def sign_using_crt():
                                                                    # |                        |       |                  |         |           |
         if dbg:                                                    # |                        |       |                  |         |           |
             if FORCE_OVERFLOW: c._force_overflow(c.bnk.crt_x, N.C) # |                        |       |                  |         |           |
-            if DUMP_VECTORS: c.dump_before_step_crt(pq, m)         # |                        |       |                  |         |           |
+            if DUMP_VECTORS: c.dump_before_step_using_crt(pq, m)   # |                        |       |                  |         |           |
                                                                    # +------------------------+-------+------------------+---------+-----------+
         c.modular_multiply(W.C, N.C, W.C, N.C, pq, mode=m, d=dbg)  # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]MBF | QINV      | <LADDER>
                                                                    # +------------------------+-------+------------------+---------+-----------+
-        if dbg and DUMP_VECTORS: c.dump_after_step_crt()           # |                        |       |                  |         |           |
+        if dbg and DUMP_VECTORS: c.dump_after_step_using_crt()     # |                        |       |                  |         |           |
         print_ladder_progress(bit, pq)                             # |                        |       |                  |         |           |
                                                                    # |                        |       |                  |         |           |
     #########################                                      # |                        |       |                  |         |           |
@@ -1576,6 +1597,87 @@ def sign_using_crt():
                                                                    # +------------------------+-------+------------------+---------+-----------+
     c.set_output_from_narrow(O.S, c.bnk.crt_x, N.A)                # |  S                     |       |                  |         |           |
                                                                    # +------------------------+-------+------------------+---------+-----------+
+
+#
+# try to exponentiate using only half of the quad-multiplier (one dual-ladder core)
+#
+def sign_without_crt():
+
+    c  = core
+    v  = vector
+    n  = n_num_words
+
+    ff = (False, False)
+
+    c.set_wide_from_input   (c.bnk.crt_x, W.N,       I.N)
+    c.set_wide_from_input   (c.bnk.crt_y, W.N,       I.N)
+    c.set_wide_from_input   (c.bnk.crt_x, W.A,       I.X)
+    c.set_wide_from_input   (c.bnk.crt_y, W.A,       I.Y)
+    c.set_wide_from_input   (c.bnk.crt_x, W.E,       I.M)
+    c.set_wide_from_input   (c.bnk.crt_y, W.E,       I.M)
+
+    c.set_narrow_from_input (c.bnk.crt_x, N.N_COEFF, I.N_COEFF)
+    c.set_narrow_from_input (c.bnk.crt_y, N.N_COEFF, I.N_COEFF)
+    c.set_narrow_from_input (c.bnk.crt_x, N.A,       I.N_FACTOR)
+    c.set_narrow_from_input (c.bnk.crt_y, N.A,       I.N_FACTOR)
+    c.set_narrow_from_input (c.bnk.crt_x, N.E,       I.M)
+    c.set_narrow_from_input (c.bnk.crt_y, N.E,       I.M)
+
+    c.modular_multiply(W.A, N.A, W.B, N.B, n)           # [XY]F = [XY] * N_FACTOR
+    c.modular_multiply(W.B, N.B, W.C, N.C, n, mode=ff)  # [XY]MF = [XY]F * [XY]F
+    c.modular_multiply(W.C, N.I, W.D, N.D, n)           # [XY]M = [XY]MF * 1
+
+    c.propagate_carries(N.D, n)
+
+    c.set_output_from_narrow(O.XM, c.bnk.crt_x, N.D)
+    c.set_output_from_narrow(O.YM, c.bnk.crt_y, N.D)
+
+    c.modular_multiply(W.E, N.B, W.C, N.C, n)   # [XY]MB = M * [XY]F
+
+    XF = c.bnk.crt_x.ladder_x._get_narrow(N.B)
+
+    c.set_wide_from_input(c.bnk.crt_x, W.A, I.N_FACTOR)
+    c.set_wide_from_input(c.bnk.crt_y, W.A, I.N_FACTOR)
+
+    c.modular_multiply(W.C, N.A, W.D, N.D, n)   # MBF = MB * N_FACTOR
+    c.modular_multiply(W.A, N.I, W.C, N.C, n)   # IF = 1 * N_FACTOR    
+    
+    c.copy_ladders_x2y(W.D, N.D, W.C, N.C)
+
+    ###########################
+    # Begin Montgomery Ladder #
+    ###########################
+
+    for bit in range(_WORD_WIDTH * n - 1, -1, -1):
+
+        m  = get_ladder_mode_without_crt(v, bit)
+        dbg = bit == DUMP_LADDER_INDEX
+
+        if dbg:
+            if FORCE_OVERFLOW: c._force_overflow(c.bnk.crt_x, N.C)
+            if DUMP_VECTORS: c.dump_before_step_without_crt(n, m)
+
+        c.modular_multiply(W.C, N.C, W.C, N.C, n, mode=m, d=dbg)
+
+        if dbg and DUMP_VECTORS: c.dump_after_step_without_crt()
+        print_ladder_progress(bit, n)
+            
+    #########################
+    # End Montgomery Ladder #
+    #########################
+
+    c.cross_ladders_x2y(W.B, N.B, W.B, N.B)
+
+    c.modular_multiply(W.C, N.I, W.D, N.D, n)           # SB = SBF * 1    
+    c.modular_multiply(W.B, N.D, W.A, N.A, n, mode=ff)  # S = XF * SB
+
+    c.copy_ladders_y2x(W.A, N.A, W.B, N.B)
+    
+    c.propagate_carries(N.B, n)
+    
+    c.set_output_from_narrow(O.S, c.bnk.crt_y, N.B)
+
+
 #
 # main()
 #
@@ -1627,35 +1729,17 @@ if __name__ == "__main__":
     xm_known = pow(vector.x.number(), 2, vector.n.number())
     ym_known = pow(vector.y.number(), 2, vector.n.number())
 
-    # sign using CRT
+    # sign using CRT and check
     print("Signing using CRT...")
     sign_using_crt()
     compare_signature()
 
-    # sign without CRT
-    # ...
+    # sign without CRT and check
+    print("Signing without CRT...")
+    sign_without_crt()
+    compare_signature()
 
 
 #
 # End-of-File
 #
-
-
-
-    # bring one into Montgomery domain (glue 2**r to one)
-    # bring blinding coefficients into Montgomery domain (glue 2**(2*r) to x and y)
-    # blind message
-    # convert message to non-redundant representation
-    # first reduce message, this glues 2**-r to the message as a side effect
-    # unglue 2**-r from message by gluing 2**r to it to compensate
-    # bring message into Montgomery domain (glue 2**r to message)
-    # do "easier" exponentiations
-    # return "easier" parts from Montgomery domain (unglue 2**r from result)
-    # do the "Garner's formula" part
-    #  r = sp - sq mod p
-    #  sr_qinv = sr * qinv mod p
-    #  q_sr_qinv = q * sr_qinv
-    #  s_crt = sq + q_sr_qinv
-    # unblind s
-    # mutate blinding factors
-