[Cryptech-Commits] [core/math/modexpng] 21/92: * MASSIVE CLEANUP

git at cryptech.is git at cryptech.is
Sat Mar 14 18:19:00 UTC 2020


This is an automated email from the git hooks/post-receive script.

paul at psgd.org pushed a commit to branch master
in repository core/math/modexpng.

commit b0fb2639a4d00033e91256486dbb9673761993d7
Author: Pavel V. Shatov (Meister) <meisterpaul1 at yandex.ru>
AuthorDate: Mon Aug 19 14:01:28 2019 +0300

    * MASSIVE CLEANUP
    
    * All the data buses are now either 16 or 18 bits wide for consistency
    
    * More consistent naming of micro-operations
    
    * More debugging options (can specify which ladder iteration to dump)
---
 modexpng_fpga_model.py | 1144 ++++++++++++++++++++++++++++--------------------
 1 file changed, 674 insertions(+), 470 deletions(-)

diff --git a/modexpng_fpga_model.py b/modexpng_fpga_model.py
index f57c7b9..71a4b91 100644
--- a/modexpng_fpga_model.py
+++ b/modexpng_fpga_model.py
@@ -63,9 +63,13 @@ NUM_MULTS  = 8
 _KEY_LENGTH_HALF = KEY_LENGTH // 2
 
 # width of internal math pipeline
-_WORD_WIDTH = 16
+_WORD_WIDTH     = 16
 _WORD_WIDTH_EXT = 18
 
+_WORD_MASK     = 2 ** _WORD_WIDTH     - 1
+_WORD_MASK_EXT = 2 ** _WORD_WIDTH_EXT - 1
+_CARRY_MASK    = _WORD_MASK ^ _WORD_MASK_EXT
+
 # folder with test vector scripts
 _VECTOR_PATH = "/vector"
 
@@ -76,16 +80,17 @@ _VECTOR_CLASS = "Vector"
 # ------------------
 # Debugging Settings
 # ------------------
-FORCE_OVERFLOW = False
-DUMP_VECTORS = False
-DUMP_INDICES = False
-DUMP_MACS_INPUTS = False
-DUMP_MACS_CLEARING = False
-DUMP_MACS_ACCUMULATION = False
-DUMP_MULT_PARTS = False
-DUMP_RCMB = False
-DUMP_REDUCTION = False
-
+DUMP_LADDER_INDEX      = -1     # at which ladder step to print debug vector
+DUMP_VECTORS           = False  # print entire debug vector components
+DUMP_INDICES           = False  # print indices of words at MAC inputs
+DUMP_MACS_INPUTS       = False  # print MAC input words
+DUMP_MACS_CLEARING     = False  # print MAC clearing bitmaps
+DUMP_MACS_ACCUMULATION = False  # print MAC accumulators contents
+DUMP_MULT_PARTS        = False  # print multiplication output parts
+DUMP_RECOMBINATION     = False  # print recombination internals
+DUMP_REDUCTION         = False  # print reduction internals
+FORCE_OVERFLOW         = False  # force rarely seen internal overflow situation to verify how its handler works
+DUMP_PROGRESS_FACTOR   = 16     # once per how many ladder steps to update progress indicator
 
 #
 # Multi-Precision Integer
@@ -116,7 +121,7 @@ class ModExpNG_Operand():
             if i > 0:
                 if (i % 4) == 0: print("")
                 else:            print(" ", end='')
-            print("%s[%2d] = 18'h%05x;" % (name, i, self.words[i]), end='')
+            print("%s[%3d] = 18'h%05x;" % (name, i, self.words[i]), end='')
         print("")
 
     def _init_from_words(self, words, count):
@@ -127,7 +132,7 @@ class ModExpNG_Operand():
             if words[i] >= (2 ** (_WORD_WIDTH_EXT)):
                 raise Exception("Word is too large!")
 
-        self.words = words
+        self.words = list(words)
 
     def _init_from_number(self, number, length):
 
@@ -160,16 +165,16 @@ class ModExpNG_Operand():
             ret += word << shift
             shift += _WORD_WIDTH
         return ret
-        
+
     def _get_half(self, part):
         num_words = len(self.words)
         num_words_half = num_words // 2
         if not part: return ModExpNG_Operand(None, num_words_half, self.words[:num_words_half])
         else:        return ModExpNG_Operand(None, num_words_half, self.words[num_words_half:])
-        
+
     def lower_half(self):
         return self._get_half(False)
-        
+
     def upper_half(self):
         return self._get_half(True)
 
@@ -222,7 +227,7 @@ class ModExpNG_WideBankEnum(Enum):
     N   = auto()
     L   = auto()
     H   = auto()
-    
+
 class ModExpNG_NarrowBankEnum(Enum):
     A       = auto()
     B       = auto()
@@ -231,7 +236,32 @@ class ModExpNG_NarrowBankEnum(Enum):
     E       = auto()
     N_COEFF = auto()
     I       = auto()
-        
+
+class ModExpNG_CoreInputEnum(Enum):
+    M        = auto()
+
+    N        = auto()
+    P        = auto()
+    Q        = auto()
+
+    N_COEFF  = auto()
+    P_COEFF  = auto()
+    Q_COEFF  = auto()
+
+    N_FACTOR = auto()
+    P_FACTOR = auto()
+    Q_FACTOR = auto()
+
+    X        = auto()
+    Y        = auto()
+
+    QINV     = auto()
+
+class ModExpNG_CoreOutputEnum(Enum):
+    XM = auto()
+    YM = auto()
+    S  = auto()
+
 class ModExpNG_WideBank():
 
     def __init__(self):
@@ -243,7 +273,7 @@ class ModExpNG_WideBank():
         self.n = None
         self.l = None
         self.h = None
-    
+
     def _get_value(self, sel):
         if   sel == ModExpNG_WideBankEnum.A:   return self.a
         elif sel == ModExpNG_WideBankEnum.B:   return self.b
@@ -276,7 +306,7 @@ class ModExpNG_NarrowBank():
         self.e       = None
         self.n_coeff = None
         self.i       = i
-        
+
     def _get_value(self, sel):
         if   sel == ModExpNG_NarrowBankEnum.A:       return self.a
         elif sel == ModExpNG_NarrowBankEnum.B:       return self.b
@@ -296,21 +326,106 @@ class ModExpNG_NarrowBank():
         elif sel == ModExpNG_NarrowBankEnum.N_COEFF: self.n_coeff = value
         else: raise Exception("ModExpNG_NarrowBank._set_value(): Invalid selector!")
 
+class ModExpNG_CoreInput():
+
+    def __init__(self):
+        self._m        = None
+
+        self._n        = None
+        self._p        = None
+        self._q        = None
+
+        self._n_coeff  = None
+        self._p_coeff  = None
+        self._q_coeff  = None
+
+        self._n_factor = None
+        self._p_factor = None
+        self._q_factor = None
+
+        self._x        = None
+        self._y        = None
+
+        self._qinv     = None
+
+    def set_value(self, sel, value):
+        if   sel == ModExpNG_CoreInputEnum.M:        self._m        = value
+
+        elif sel == ModExpNG_CoreInputEnum.N:        self._n        = value
+        elif sel == ModExpNG_CoreInputEnum.P:        self._p        = value
+        elif sel == ModExpNG_CoreInputEnum.Q:        self._q        = value
+
+        elif sel == ModExpNG_CoreInputEnum.N_COEFF:  self._n_coeff  = value
+        elif sel == ModExpNG_CoreInputEnum.P_COEFF:  self._p_coeff  = value
+        elif sel == ModExpNG_CoreInputEnum.Q_COEFF:  self._q_coeff  = value
+
+        elif sel == ModExpNG_CoreInputEnum.N_FACTOR: self._n_factor = value
+        elif sel == ModExpNG_CoreInputEnum.P_FACTOR: self._p_factor = value
+        elif sel == ModExpNG_CoreInputEnum.Q_FACTOR: self._q_factor = value
+
+        elif sel == ModExpNG_CoreInputEnum.X:        self._x        = value
+        elif sel == ModExpNG_CoreInputEnum.Y:        self._y        = value
+
+        elif sel == ModExpNG_CoreInputEnum.QINV:     self._qinv     = value
+
+        else: raise Exception("ModExpNG_CoreInput.set_value(): invalid selector!")
+
+    def _get_value(self, sel):
+        if   sel == ModExpNG_CoreInputEnum.M:        return self._m
+
+        elif sel == ModExpNG_CoreInputEnum.N:        return self._n
+        elif sel == ModExpNG_CoreInputEnum.P:        return self._p
+        elif sel == ModExpNG_CoreInputEnum.Q:        return self._q
+
+        elif sel == ModExpNG_CoreInputEnum.N_COEFF:  return self._n_coeff
+        elif sel == ModExpNG_CoreInputEnum.P_COEFF:  return self._p_coeff
+        elif sel == ModExpNG_CoreInputEnum.Q_COEFF:  return self._q_coeff
+
+        elif sel == ModExpNG_CoreInputEnum.N_FACTOR: return self._n_factor
+        elif sel == ModExpNG_CoreInputEnum.P_FACTOR: return self._p_factor
+        elif sel == ModExpNG_CoreInputEnum.Q_FACTOR: return self._q_factor
+
+        elif sel == ModExpNG_CoreInputEnum.X:        return self._x
+        elif sel == ModExpNG_CoreInputEnum.Y:        return self._y
+
+        elif sel == ModExpNG_CoreInputEnum.QINV:     return self._qinv
+
+        else: raise Exception("ModExpNG_CoreInput._get_value(): invalid selector!")
+
+class ModExpNG_CoreOutput():
+
+    def __init__(self):
+        self._xm = None
+        self._ym = None
+        self._s  = None
+
+    def _set_value(self, sel, value):
+        if   sel == ModExpNG_CoreOutputEnum.XM: self._xm = value
+        elif sel == ModExpNG_CoreOutputEnum.YM: self._ym = value
+        elif sel == ModExpNG_CoreOutputEnum.S:  self._s  = value
+        else: raise Exception("ModExpNG_CoreOutput._set_value(): invalid selector!")
+
+    def get_value(self, sel):
+        if   sel == ModExpNG_CoreOutputEnum.XM: return self._xm
+        elif sel == ModExpNG_CoreOutputEnum.YM: return self._ym
+        elif sel == ModExpNG_CoreOutputEnum.S:  return self._s
+        else: raise Exception("ModExpNG_CoreOutput.get_value(): invalid selector!")
+
 class ModExpNG_BanksPair():
-    
+
     def __init__(self, i):
         self.wide = ModExpNG_WideBank()
         self.narrow = ModExpNG_NarrowBank(i)
-        
+
     def _get_wide(self, sel):
         return self.wide._get_value(sel)
 
     def _get_narrow(self, sel):
         return self.narrow._get_value(sel)
-        
+
     def _set_wide(self, sel, value):
         self.wide._set_value(sel, value)
-        
+
     def _set_narrow(self, sel, value):
         self.narrow._set_value(sel, value)
 
@@ -319,20 +434,6 @@ class ModExpNG_BanksLadder():
     def __init__(self, i):
         self.ladder_x = ModExpNG_BanksPair(i)
         self.ladder_y = ModExpNG_BanksPair(i)
-        
-    def set_modulus(self, n, n_coeff):
-        self.ladder_x.wide._set_value(ModExpNG_WideBankEnum.N, n)
-        self.ladder_y.wide._set_value(ModExpNG_WideBankEnum.N, n)
-        self.ladder_x.narrow._set_value(ModExpNG_NarrowBankEnum.N_COEFF, n_coeff)
-        self.ladder_y.narrow._set_value(ModExpNG_NarrowBankEnum.N_COEFF, n_coeff)
-        
-    def set_operands_crt_xy(self, sel_wide, sel_narrow, x, y):
-        if sel_wide is not None:
-            self.ladder_x.wide._set_value(sel_wide, x)
-            self.ladder_y.wide._set_value(sel_wide, x)
-        if sel_narrow is not None:
-            self.ladder_x.narrow._set_value(sel_narrow, y)
-            self.ladder_y.narrow._set_value(sel_narrow, y)
 
 class ModExpNG_BanksCRT():
 
@@ -350,7 +451,7 @@ class ModExpNG_PartRecombinator():
 
     def _flush_pipeline(self, dump):
         self.z0, self.y0, self.x0 = 0, 0, 0
-        if dump and DUMP_RCMB:
+        if dump and DUMP_RECOMBINATION:
             print("RCMB -> flush()")
 
     def _push_pipeline(self, part, dump):
@@ -366,13 +467,13 @@ class ModExpNG_PartRecombinator():
         x1 = x + self.y0 + (self.x0 >> _WORD_WIDTH) # IMPORTANT: This carry can be up to two bits wide!!
 
         # save lower 16 bits of the rightmost cell
-        t = self.x0 & 0xffff
+        t = self.x0 & _WORD_MASK
 
         # update internal latches
         self.z0, self.y0, self.x0 = z1, y1, x1
 
         # dump
-        if dump and DUMP_RCMB:
+        if dump and DUMP_RECOMBINATION:
             print("RCMB -> push(): part = 0x%012x, word = 0x%04x" % (part, t))
 
         # done
@@ -416,7 +517,7 @@ class ModExpNG_PartRecombinator():
         # merge upper half adding the two overlapping words
         for x in range(ab_num_words):
             next_word = words_msb[x]
-            if x < 2:                
+            if x < 2:
                 next_word += words_lsb[x + ab_num_words]
             words.append(next_word)
 
@@ -469,7 +570,7 @@ class ModExpNG_PartRecombinator():
 
             if i > 0:
                 words_msb.append(next_word)
-                
+
         # merge words
         words = list()
 
@@ -522,10 +623,10 @@ class ModExpNG_WordMultiplier():
 
     def _update_one_mac(self, x, t, col, a, b, dump, need_aux=False):
 
-        if a > 0x3FFFF:
+        if a >= (2 ** _WORD_WIDTH_EXT):
             raise Exception("a > 0x3FFFF!")
 
-        if b > 0xFFFF:
+        if b >= (2 ** _WORD_WIDTH):
             raise Exception("b > 0xFFFF!")
 
         p = a * b
@@ -534,20 +635,21 @@ class ModExpNG_WordMultiplier():
             if x > 0: print("; ", end='')
             print("MAC[%d]: a=%05x" % (x, a), end='')
             if x == (NUM_MULTS-1) and not need_aux: print("")
-            
+
         self._macs[x] += p
 
     def _update_mac_aux(self, y, col, a, b, dump):
-        
-        if a > 0x3FFFF:
+
+        if a >= (2 ** _WORD_WIDTH_EXT):
             raise Exception("a > 0x3FFFF!")
 
-        if b > 0xFFFF:
+        if b >= (2 ** _WORD_WIDTH):
             raise Exception("b > 0xFFFF!")
 
         p = a * b
         if dump and DUMP_MACS_INPUTS:
             print("; AUX: a=%05x" % a)
+            
         self._mac_aux[0] += p
 
     def _preset_indices(self, col):
@@ -617,9 +719,9 @@ class ModExpNG_WordMultiplier():
             parts.append(0)
 
         for col in range(num_cols):
-        
+
             b_carry = 0
-        
+
             for t in range(ab_num_words):
 
                 # take care of indices
@@ -638,10 +740,13 @@ class ModExpNG_WordMultiplier():
                 if dump and DUMP_INDICES: self._dump_indices(t, col)
 
                 # current b-word
-                # TODO: Explain how the 18th bit carry works!!
+                # multiplier's b-input is limited to 16-bit words, so we need to propagate
+                # carries on the fly here, carry can be up to two bits
                 bt = b_narrow.words[t] + b_carry
-                b_carry = (bt & 0x30000) >> 16
-                bt &= 0xFFFF
+                b_carry = (bt & _CARRY_MASK) >> _WORD_WIDTH
+                if dump and b_carry > 1:
+                    print("Rare overflow case was detected and then successfully corrected.")
+                bt &= _WORD_MASK
 
                 # multiply by a-words
                 for x in range(NUM_MULTS):
@@ -781,13 +886,8 @@ class ModExpNG_WordMultiplier():
 
 class ModExpNG_LowlevelOperator():
 
-    def __init__(self):
-        self._word_mask = 0
-        for x in range(_WORD_WIDTH):
-            self._word_mask |= (1 << x)
-
     def _check_word(self, a):
-        if a < 0 or a >= (2 ** _WORD_WIDTH):
+        if a < 0 or a > _WORD_MASK:
             raise Exception("Word out of range!")
 
     def _check_carry_borrow(self, cb):
@@ -802,13 +902,13 @@ class ModExpNG_LowlevelOperator():
 
         sum = a + b + c_in
 
-        sum_s = sum & self._word_mask
-        sum_c = (sum >> _WORD_WIDTH) & 1
+        sum_s = sum & _WORD_MASK
+        sum_c = sum >> _WORD_WIDTH
 
         return (sum_c, sum_s)
 
     def sub_words(self, a, b, b_in):
-    
+
         self._check_word(a)
         self._check_word(b)
         self._check_carry_borrow(b_in)
@@ -827,127 +927,44 @@ class ModExpNG_LowlevelOperator():
 class ModExpNG_Worker():
 
     def __init__(self):
-        self.recombinator = ModExpNG_PartRecombinator()
-        self.multiplier   = ModExpNG_WordMultiplier()
         self.lowlevel     = ModExpNG_LowlevelOperator()
+        self.multiplier   = ModExpNG_WordMultiplier()
+        self.recombinator = ModExpNG_PartRecombinator()
 
-    def exponentiate(self, iz, bz, e, n, n_factor, n_coeff, num_words, dump_index=-1, dump_mode=""):
-
-        # working variables
-        t1, t2 = iz, bz
-
-        # length-1, length-2, length-3, ..., 1, 0 (left-to-right)
-        for bit in range(_WORD_WIDTH * num_words - 1, -1, -1):
-
-            bit_value = (e.number() & (1 << bit)) >> bit
-
-            if bit > 500:
-                print("%s: bit=#%d (%d)" % (dump_mode, bit, bit_value))
-                print("")
-                print("%s_T1_BEFORE: %s" % (dump_mode, hex(t1.number())))
-                print("%s_T2_BEFORE: %s" % (dump_mode, hex(t2.number())))
-                print("")
-            else:
-                return None
-
-
-            debug_dump = bit == dump_index
-
-            
-            if debug_dump:
-                print("\rladder_mode = %d" % bit_value)
-                
-                if FORCE_OVERFLOW:
-                    T1X = list(t1.words)
-                    for i in range(num_words):
-                        if i > 0:
-                            bits = T1X[i-1] & (3 << 16)
-                            if bits == 0:
-                                bits = T1X[i] & 3
-                                T1X[i] = T1X[i] ^ bits
-                                T1X[i-1] |= (bits << 16)
-                                    
-                    for i in range(num_words):
-                        t1.words[i] = T1X[i]
-                
-                if DUMP_VECTORS:
-                    print("num_words = %d" % num_words)
-                    t1.format_verilog_concat("%s_T1" % dump_mode)
-                    t2.format_verilog_concat("%s_T2" % dump_mode)
-                    n.format_verilog_concat("%s_N" % dump_mode)
-                    n_coeff.format_verilog_concat("%s_N_COEFF"  % dump_mode)
-                            # force the rarely seen overflow
-
-            if bit_value:
-                p1 = self.multiply(t1, t2, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="X")
-                p2 = self.multiply(t2, t2, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="Y")
-            else:
-                p1 = self.multiply(t1, t1, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="X")
-                p2 = self.multiply(t2, t1, n, n_coeff, num_words, dump=debug_dump, dump_mode=dump_mode, dump_phase="Y")
-
-            t1, t2 = p1, p2
-
-            if debug_dump and DUMP_VECTORS:
-                t1.format_verilog_concat("%s_X" % dump_mode)
-                t2.format_verilog_concat("%s_Y" % dump_mode)
-
-            if (bit % 16) == 0:
-                pct = float((_WORD_WIDTH * num_words - bit) / (_WORD_WIDTH * num_words)) * 100.0
-                print("\rpct: %5.1f%%" % pct, end='')
-        
-        print("")
-
-        return t1
-
-    def subtract(self, a, b, n, ab_num_words):
-
+    def serial_subtract_modular(self, a, b, n, ab_num_words):
         c_in = 0
         b_in = 0
-
         ab = list()
         ab_n = list()
-
         for x in range(ab_num_words):
-
             a_word = a.words[x]
             b_word = b.words[x]
-
             (b_out, d_out) = self.lowlevel.sub_words(a_word, b_word, b_in)
             (c_out, s_out) = self.lowlevel.add_words(d_out, n.words[x], c_in)
-
             ab.append(d_out)
             ab_n.append(s_out)
-
             (c_in, b_in) = (c_out, b_out)
-
         d = ab if not b_out else ab_n
-
         return ModExpNG_Operand(None, ab_num_words, d)
 
-    def add(self, a, b, ab_num_words):
-
+    def serial_add_uneven(self, a, b, ab_num_words):
         c_in = 0
-
         ab = list()
-
         for x in range(2 * ab_num_words):
-
             a_word = a.words[x] if x < ab_num_words else 0
             b_word = b.words[x]
-
             (c_out, s_out) = self.lowlevel.add_words(a_word, b_word, c_in)
-
             ab.append(s_out)
-
             c_in = c_out
-
         return ModExpNG_Operand(None, 2*ab_num_words, ab)
 
-    def multiply(self, a, b, n, n_coeff, ab_num_words, reduce_only=False, multiply_only=False, dump=False, dump_mode="", dump_phase=""):
+    def multipurpose_multiply(self, a, b, n, n_coeff, ab_num_words, reduce_only=False, multiply_only=False, dump=False, dump_crt="", dump_ladder=""):
 
+        #
         # 1. AB = A * B
-        if dump: print("multiply_square(%s_%s)" % (dump_mode, dump_phase))
-        
+        #
+        if dump: print("multiply_square(%s_%s)" % (dump_crt, dump_ladder))
+
         if reduce_only:
             ab = b
         else:
@@ -956,169 +973,220 @@ class ModExpNG_Worker():
             ab = ModExpNG_Operand(None, 2 * ab_num_words, ab_words)
 
         if dump and DUMP_VECTORS:
-            ab.format_verilog_concat("%s_%s_AB" % (dump_mode, dump_phase))
+            ab.format_verilog_concat("%s_%s_AB" % (dump_crt, dump_ladder))
 
         if multiply_only:
             return ModExpNG_Operand(None, 2*ab_num_words, ab_words)
 
-            
+        #
         # 2. Q = LSB(AB) * N_COEFF
-        if dump: print("multiply_triangle(%s_%s)" % (dump_mode, dump_phase))
-        
+        #
+        if dump: print("multiply_triangle(%s_%s)" % (dump_crt, dump_ladder))
+
         q_parts = self.multiplier.multiply_triangle(ab, n_coeff, ab_num_words, dump)
         q_words = self.recombinator.recombine_triangle(q_parts, ab_num_words, dump)
         q = ModExpNG_Operand(None, ab_num_words + 1, q_words)
 
         if dump and DUMP_VECTORS:
-            q.format_verilog_concat("%s_%s_Q" % (dump_mode, dump_phase))
+            q.format_verilog_concat("%s_%s_Q" % (dump_crt, dump_ladder))
 
+        #
         # 3. M = Q * N
-        if dump: print("multiply_rectangle(%s_%s)" % (dump_mode, dump_phase))
-        
+        #
+        if dump: print("multiply_rectangle(%s_%s)" % (dump_crt, dump_ladder))
+
         m_parts = self.multiplier.multiply_rectangle(n, q, ab_num_words, dump)
         m_words = self.recombinator.recombine_rectangle(m_parts, ab_num_words, dump)
         m = ModExpNG_Operand(None, 2 * ab_num_words + 1, m_words)
-        
-        if dump and DUMP_VECTORS:
-            m.format_verilog_concat("%s_%s_M" % (dump_mode, dump_phase))
 
-        if (m.number() != (q.number() * n.number())):
-            print("MISMATCH")
-            sys.exit()
+        if dump and DUMP_VECTORS:
+            m.format_verilog_concat("%s_%s_M" % (dump_crt, dump_ladder))
 
-            
+        #
         # 4. R = AB + M
-        
+        #
+
+        #
         # 4a. compute carry (actual sum is all zeroes and need not be stored)
+        #
+        
         r_cy = 0 # this can be up to two bits, since we're adding extended words!!
         for i in range(ab_num_words + 1):
             s = ab.words[i] + m.words[i] + r_cy
-            r_cy_new = s >> 16
-            
+            r_cy_new = s >> _WORD_WIDTH
+
             if dump and DUMP_REDUCTION:
                 print("[%2d] 0x%05x + 0x%05x + 0x%x => {0x%x, [0x%05x]}" %
-                    (i, ab.words[i], m.words[i], r_cy, r_cy_new, s & 0xffff))
-                
+                    (i, ab.words[i], m.words[i], r_cy, r_cy_new, s & 0xffff))   # NOTE(review): mask is still the literal 0xffff here; other sites in this commit use _WORD_MASK
+
             r_cy = r_cy_new
-        
-        
+
+
+        #
         # 4b. Initialize empty result
+        #
+        
         R = list()
         for i in range(ab_num_words):
             R.append(0)
 
+        #
         # 4c. compute the actual upper part of sum (take carry into account)
-        for i in range(ab_num_words):
+        #
         
+        for i in range(ab_num_words):
+
             if dump and DUMP_REDUCTION:
                 print("[%2d]" % i, end='')
-                
+
             ab_word = ab.words[ab_num_words + i + 1] if i < (ab_num_words - 1) else 0
             if dump and DUMP_REDUCTION:
                 print(" 0x%05x" % ab_word, end='')
-                
+
             m_word = m.words[ab_num_words + i + 1]
             if dump and DUMP_REDUCTION:
                 print(" + 0x%05x" % m_word, end='')
-                
+
             if i == 0: R[i] = r_cy
             else:      R[i] = 0
-            
-            if (r_cy > 3): print("\rR_CY = %d!" % r_cy)
-            
+
             if dump and DUMP_REDUCTION:
                 print(" + 0x%x" % R[i], end='')
-                
+
             R[i] += ab_word
             R[i] += m_word
             if dump and DUMP_REDUCTION:
                 print(" = 0x%05x" % R[i])
-                        
+
         return ModExpNG_Operand(None, ab_num_words, R)
-    
-    def reduce(self, a, num_words, carry_in=0):
-        carry = carry_in
+
+    def convert_nonredundant(self, a, num_words):
+        carry = 0
         for x in range(num_words):
             a.words[x] += carry
-            carry = (a.words[x] >> _WORD_WIDTH) & 3
-            a.words[x] &= self.lowlevel._word_mask
+            carry = a.words[x] >> _WORD_WIDTH
+            a.words[x] &= _WORD_MASK
         return carry
 
-class ModExpNG_CoreOutputEnum(Enum):
-    XM = auto()
-    YM = auto()
-    S  = auto()
-            
-class ModExpNG_CoreOutput():
-    
-    def __init__(self):
-        self._xm = None
-        self._ym = None
-        self._s  = None
-        
-    def _set_value(self, sel, value):
-        if   sel == ModExpNG_CoreOutputEnum.XM: self._xm = value
-        elif sel == ModExpNG_CoreOutputEnum.YM: self._ym = value
-        elif sel == ModExpNG_CoreOutputEnum.S:  self._s  = value
-        else: raise Exception("ModExpNG_CoreOutput._set_value(): invalid selector!")
-        
-    def get_value(self, sel):
-        if   sel == ModExpNG_CoreOutputEnum.XM: return self._xm
-        elif sel == ModExpNG_CoreOutputEnum.YM: return self._ym
-        elif sel == ModExpNG_CoreOutputEnum.S:  return self._s
-        else: raise Exception("ModExpNG_CoreOutput.get_value(): invalid selector!")
-            
 class ModExpNG_Core():
-    
+
     def __init__(self, i):
         self.wrk = ModExpNG_Worker()
         self.bnk = ModExpNG_BanksCRT(i)
+        self.inp = ModExpNG_CoreInput()
         self.out = ModExpNG_CoreOutput()
+
+    #
+    # CRT_(X|Y) means either CRT_X or CRT_Y
+    # LADDER_{X,Y} means both LADDER_X and LADDER_Y
+    #
+
+    #
+    # copy from CRT_(X|Y).LADDER_X.NARROW to OUTPUT
+    #
+    def set_output_from_narrow(self, sel_output, bank_crt, sel_narrow):
+        self.out._set_value(sel_output, bank_crt.ladder_x.narrow._get_value(sel_narrow))
+
+    #
+    # copy from INPUT to CRT_(X|Y).LADDER_{X,Y}.NARROW
+    #
+    def set_narrow_from_input(self, bank_crt, sel_narrow, sel_input):
+        bank_crt.ladder_x._set_narrow(sel_narrow, self.inp._get_value(sel_input))
+        bank_crt.ladder_y._set_narrow(sel_narrow, self.inp._get_value(sel_input))
+
+    #
+    # copy from INPUT to CRT_(X|Y).LADDER_{X,Y}.WIDE
+    #
+    def set_wide_from_input(self, bank_crt, sel_wide, sel_input):
+        bank_crt.ladder_x._set_wide(sel_wide, self.inp._get_value(sel_input))
+        bank_crt.ladder_y._set_wide(sel_wide, self.inp._get_value(sel_input))
+
+    #
+    # copy from CRT_Y.LADDER_{X,Y}.{WIDE,NARROW} to CRT_X.LADDER_{X,Y}.{WIDE,NARROW}
+    #
+    def copy_crt_y2x(self, sel_wide, sel_narrow):
+
+        self.bnk.crt_x.ladder_x._set_wide(sel_wide, self.bnk.crt_y.ladder_x._get_wide(sel_wide))
+        self.bnk.crt_x.ladder_y._set_wide(sel_wide, self.bnk.crt_y.ladder_y._get_wide(sel_wide))
+
+        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow))
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow, self.bnk.crt_y.ladder_y._get_narrow(sel_narrow))
+
+    #
+    # copy from CRT_{X,Y}.LADDER_X.{WIDE,NARROW} to CRT_{X,Y}.LADDER_Y.{WIDE,NARROW}
+    #
+    def copy_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
+
+        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, self.bnk.crt_x.ladder_x._get_wide(sel_wide_in))
+        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, self.bnk.crt_y.ladder_x._get_wide(sel_wide_in))
+
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in))
+        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in))
+
+    #
+    # copy from CRT_{X,Y}.LADDER_X.{WIDE,NARROW} to CRT_{Y,X}.LADDER_Y.{WIDE,NARROW}
+    #
+    def cross_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
+
+        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, self.bnk.crt_y.ladder_x._get_wide(sel_wide_in))
+        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, self.bnk.crt_x.ladder_x._get_wide(sel_wide_in))
         
-    def modular_multiply(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words, mode=(True, True)):
-        
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in))
+        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in))
+
+    #
+    # modular multiply sel_wide_in by sel_narrow_in
+    # stores intermediate result in WIDE.L and WIDE.H
+    # needs modulus WIDE.N and speed-up coefficients NARROW.N_COEFF to be filled
+    # places two copies of resulting quantity in sel_wide_out and sel_narrow_out
+    # sel_*_in and sel_*_out can overlap (overwriting of input operands is ok)
+    #
+    def modular_multiply(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words, mode=(True, True), d=False):
+
         xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
         yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        
+
         xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
         yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        
+
         xxa       = self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in)
         xya       = self.bnk.crt_x.ladder_y.wide._get_value(sel_wide_in)
 
         yxa       = self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in)
         yya       = self.bnk.crt_y.ladder_y.wide._get_value(sel_wide_in)
-        
+
         xxb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
         xyb       = self.bnk.crt_x.ladder_y.narrow._get_value(sel_narrow_in)
 
         yxb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
         yyb       = self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow_in)
-        
+
         if not mode[0]: xb = xxb
         else:           xb = xyb
 
         if not mode[1]: yb = yxb
         else:           yb = yyb
 
-        xxp = self.wrk.multiply(xxa, xb, xn, xn_coeff, num_words)
-        xyp = self.wrk.multiply(xya, xb, xn, xn_coeff, num_words)
+        xxp = self.wrk.multipurpose_multiply(xxa, xb, xn, xn_coeff, num_words, dump=d, dump_crt="X", dump_ladder="X")
+        xyp = self.wrk.multipurpose_multiply(xya, xb, xn, xn_coeff, num_words, dump=d, dump_crt="X", dump_ladder="Y")
 
-        yxp = self.wrk.multiply(yxa, yb, yn, yn_coeff, num_words)
-        yyp = self.wrk.multiply(yya, yb, yn, yn_coeff, num_words)
-        
-        if sel_wide_out is not None:
-            self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xxp)
-            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, xyp)
-            self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yxp)
-            self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, yyp)
-        
-        if sel_narrow_out is not None:
-            self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xxp)
-            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, xyp)
-            self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yxp)
-            self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, yyp)
+        yxp = self.wrk.multipurpose_multiply(yxa, yb, yn, yn_coeff, num_words, dump=d, dump_crt="Y", dump_ladder="X")
+        yyp = self.wrk.multipurpose_multiply(yya, yb, yn, yn_coeff, num_words, dump=d, dump_crt="Y", dump_ladder="Y")
+
+        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, xxp)
+        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, xyp)
+        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, yxp)
+        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, yyp)
+
+        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xxp)
+        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xyp)
+        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yxp)
+        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yyp)
 
+    #
+    # modular subtract values in sel_narrow_in (X-Y)
+    # stores two copies of the result in sel_*_out
+    #
     def modular_subtract(self, sel_narrow_in, sel_narrow_out, sel_wide_out, num_words):
 
         xa = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
@@ -1128,123 +1196,105 @@ class ModExpNG_Core():
         ya = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
         yb = self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow_in)
         yn = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        
-        xd = self.wrk.subtract(xa, xb, xn, num_words)
-        yd = self.wrk.subtract(ya, yb, yn, num_words)
-        
+
+        xd = self.wrk.serial_subtract_modular(xa, xb, xn, num_words)
+        yd = self.wrk.serial_subtract_modular(ya, yb, yn, num_words)
+
         self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xd)
         self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yd)
 
         self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xd)
         self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yd)
-        
-    def reduce_narrow(self, sel_narrow, num_words):
-        self.wrk.reduce(self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow), num_words)
-        self.wrk.reduce(self.bnk.crt_x.ladder_y.narrow._get_value(sel_narrow), num_words)
-        self.wrk.reduce(self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow), num_words)
-        self.wrk.reduce(self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow), num_words)
+    
+    #
+    # modular reduce sel_narrow_in
+    # stores two copies of the result in sel_*_out
+    #
+    def modular_reduce(self, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words):
+
+        xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+        yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
+
+        xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
+        yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
 
+        xb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
+        yb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
+
+        xp = self.wrk.multipurpose_multiply(None, xb, xn, xn_coeff, num_words, reduce_only=True)
+        yp = self.wrk.multipurpose_multiply(None, yb, yn, yn_coeff, num_words, reduce_only=True)
+
+        self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xp)
+        self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, xp)
+        self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yp)
+        self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, yp)
+
+        self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xp)
+        self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, xp)
+        self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yp)
+        self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, yp)
+
+    #
+    # propagate carries in the content of sel_narrow (convert it to non-redundant representation)
+    # overwrites input value
+    #
+    def propagate_carries(self, sel_narrow, num_words):
+        self.wrk.convert_nonredundant(self.bnk.crt_x.ladder_x._get_narrow(sel_narrow), num_words)
+        self.wrk.convert_nonredundant(self.bnk.crt_x.ladder_y._get_narrow(sel_narrow), num_words)
+        self.wrk.convert_nonredundant(self.bnk.crt_y.ladder_x._get_narrow(sel_narrow), num_words)
+        self.wrk.convert_nonredundant(self.bnk.crt_y.ladder_y._get_narrow(sel_narrow), num_words)
+
+    #
+    # concatenate CRT_{X,Y}.LADDER_{X,Y}.WIDE.{L,H} (L words first, then H) into CRT_{X,Y}.LADDER_{X,Y}.NARROW
+    #
     def merge_lha(self, sel_narrow, num_words):
         xx_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
-        xy_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
-        yx_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
-        yy_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
-        
+        xy_lsb = self.bnk.crt_x.ladder_y._get_wide(ModExpNG_WideBankEnum.L)
+        yx_lsb = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
+        yy_lsb = self.bnk.crt_y.ladder_y._get_wide(ModExpNG_WideBankEnum.L)
+
         xx_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
-        xy_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
-        yx_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
-        yy_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
-        
+        xy_msb = self.bnk.crt_x.ladder_y._get_wide(ModExpNG_WideBankEnum.H)
+        yx_msb = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
+        yy_msb = self.bnk.crt_y.ladder_y._get_wide(ModExpNG_WideBankEnum.H)
+
         xx = xx_lsb.words + xx_msb.words
         xy = xy_lsb.words + xy_msb.words
         yx = yx_lsb.words + yx_msb.words
         yy = yy_lsb.words + yy_msb.words
-        
+
         self.bnk.crt_x.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, xx))
         self.bnk.crt_x.ladder_y._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, xy))
         self.bnk.crt_y.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, yx))
         self.bnk.crt_y.ladder_y._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, yy))
 
-    def modular_reduce(self, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words):
-    
-        xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        
-        xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        
-        xb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
-        yb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
-        
-        xp = self.wrk.multiply(None, xb, xn, xn_coeff, num_words, reduce_only=True)
-        yp = self.wrk.multiply(None, yb, yn, yn_coeff, num_words, reduce_only=True)
-        
-        if sel_wide_out is not None:
-            self.bnk.crt_x.ladder_x.wide._set_value(sel_wide_out, xp)
-            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, xp)
-            self.bnk.crt_y.ladder_x.wide._set_value(sel_wide_out, yp)
-            self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, yp)
-        
-        if sel_narrow_out is not None:
-            self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow_out, xp)
-            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, xp)
-            self.bnk.crt_y.ladder_x.narrow._set_value(sel_narrow_out, yp)
-            self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, yp)
-
-    def set_output(self, sel_output, banks_ladder, sel_narrow):
-        self.out._set_value(sel_output, banks_ladder.ladder_x.narrow._get_value(sel_narrow))
-    
-    def move_crt_y2x(self, sel_wide, sel_narrow):
-    
-        if sel_wide is not None:
-            self.bnk.crt_x.ladder_x.wide._set_value(sel_wide, self.bnk.crt_y.ladder_x.wide._get_value(sel_wide))
-            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide, self.bnk.crt_y.ladder_y.wide._get_value(sel_wide))
-
-        if sel_narrow is not None:
-            self.bnk.crt_x.ladder_x.narrow._set_value(sel_narrow, self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow))
-            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow, self.bnk.crt_y.ladder_y.narrow._get_value(sel_narrow))
-
-    def move_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
-        
-        if sel_wide_out is not None:
-            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in))
-            self.bnk.crt_y.ladder_y.wide._set_value(sel_wide_out, self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in))
-
-        if sel_narrow_out is not None:
-            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in))
-            self.bnk.crt_y.ladder_y.narrow._set_value(sel_narrow_out, self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in))
-
-    def flip_ladder_y2x(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
-        
-        if sel_wide_out is not None:
-            self.bnk.crt_x.ladder_y.wide._set_value(sel_wide_out, self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in))
-
-        if sel_narrow_out is not None:
-            self.bnk.crt_x.ladder_y.narrow._set_value(sel_narrow_out, self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in))
-
-    def just_multiply(self, sel_wide_in, sel_narrow_in, num_words):
+    #
+    # multiply sel_wide_in by sel_narrow_in
+    # stores the double-width product in WIDE.L (lower half) and WIDE.H (upper half)
+    #
+    def regular_multiply(self, sel_wide_in, sel_narrow_in, num_words):
 
         xn       = self.bnk.crt_x.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
         yn       = self.bnk.crt_y.ladder_x.wide._get_value(ModExpNG_WideBankEnum.N)
-        
+
         xn_coeff = self.bnk.crt_x.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
         yn_coeff = self.bnk.crt_y.ladder_x.narrow._get_value(ModExpNG_NarrowBankEnum.N_COEFF)
-        
+
         xxa       = self.bnk.crt_x.ladder_x.wide._get_value(sel_wide_in)
         xya       = self.bnk.crt_x.ladder_y.wide._get_value(sel_wide_in)
 
         yxa       = self.bnk.crt_y.ladder_x.wide._get_value(sel_wide_in)
         yya       = self.bnk.crt_y.ladder_y.wide._get_value(sel_wide_in)
-        
+
         xb       = self.bnk.crt_x.ladder_x.narrow._get_value(sel_narrow_in)
         yb       = self.bnk.crt_y.ladder_x.narrow._get_value(sel_narrow_in)
-        
-        xxp = self.wrk.multiply(xxa, xb, None, None, num_words, multiply_only=True)
-        xyp = self.wrk.multiply(xya, xb, None, None, num_words, multiply_only=True)
 
-        yxp = self.wrk.multiply(yxa, yb, None, None, num_words, multiply_only=True)
-        yyp = self.wrk.multiply(yya, yb, None, None, num_words, multiply_only=True)
-        
+        xxp = self.wrk.multipurpose_multiply(xxa, xb, None, None, num_words, multiply_only=True)
+        xyp = self.wrk.multipurpose_multiply(xya, xb, None, None, num_words, multiply_only=True)
+
+        yxp = self.wrk.multipurpose_multiply(yxa, yb, None, None, num_words, multiply_only=True)
+        yyp = self.wrk.multipurpose_multiply(yya, yb, None, None, num_words, multiply_only=True)
+
         xxp_lsb = xxp.lower_half()
         xxp_msb = xxp.upper_half()
 
@@ -1256,7 +1306,7 @@ class ModExpNG_Core():
 
         yyp_lsb = yyp.lower_half()
         yyp_msb = yyp.upper_half()
-                
+
         self.bnk.crt_x.ladder_x.wide._set_value(ModExpNG_WideBankEnum.L, xxp_lsb)
         self.bnk.crt_x.ladder_y.wide._set_value(ModExpNG_WideBankEnum.L, xyp_lsb)
         self.bnk.crt_y.ladder_x.wide._set_value(ModExpNG_WideBankEnum.L, yxp_lsb)
@@ -1266,8 +1316,12 @@ class ModExpNG_Core():
         self.bnk.crt_x.ladder_y.wide._set_value(ModExpNG_WideBankEnum.H, xyp_msb)
         self.bnk.crt_y.ladder_x.wide._set_value(ModExpNG_WideBankEnum.H, yxp_msb)
         self.bnk.crt_y.ladder_y.wide._set_value(ModExpNG_WideBankEnum.H, yyp_msb)
-    
-    def just_add(self, sel_narrow_a_in, sel_narrow_b_in, sel_narrow_out, num_words):
+
+    #
+    # adds sel_narrow_a_in to sel_narrow_b_in
+    # stores result in sel_narrow_out
+    #
+    def regular_add(self, sel_narrow_a_in, sel_narrow_b_in, sel_narrow_out, num_words):
         xxa = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_a_in)
         xya = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_a_in)
         yxa = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_a_in)
@@ -1277,41 +1331,317 @@ class ModExpNG_Core():
         xyb = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_b_in)
         yxb = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_b_in)
         yyb = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_b_in)
-        
-        xxc = self.wrk.add(xxa, xxb, num_words)
-        xyc = self.wrk.add(xya, xyb, num_words)
-        yxc = self.wrk.add(yxa, yxb, num_words)
-        yyc = self.wrk.add(yya, yyb, num_words)
-        
+
+        xxc = self.wrk.serial_add_uneven(xxa, xxb, num_words)
+        xyc = self.wrk.serial_add_uneven(xya, xyb, num_words)
+        yxc = self.wrk.serial_add_uneven(yxa, yxb, num_words)
+        yyc = self.wrk.serial_add_uneven(yya, yyb, num_words)
+
         self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xxc)
         self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xyc)
         self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yxc)
         self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yyc)
-        
-        
+
+    #
+    # dump working variables before ladder step
+    #
+    def dump_before_step_crt(self, pq, m):
+        print("num_words = %d" % pq)
+        print("\rladder_mode_x = %d" % m[0])
+        print("\rladder_mode_y = %d" % m[1])
+        self.bnk.crt_x.ladder_x._get_narrow(N.C).format_verilog_concat("X_X")
+        self.bnk.crt_x.ladder_y._get_narrow(N.C).format_verilog_concat("X_Y")
+        self.bnk.crt_y.ladder_x._get_narrow(N.C).format_verilog_concat("Y_X")
+        self.bnk.crt_y.ladder_y._get_narrow(N.C).format_verilog_concat("Y_Y")
+        self.bnk.crt_x.ladder_x._get_wide(W.N).format_verilog_concat("X_N")
+        self.bnk.crt_x.ladder_x._get_wide(W.N).format_verilog_concat("Y_N")
+        self.bnk.crt_x.ladder_x._get_narrow(N.N_COEFF).format_verilog_concat("X_N_COEFF")
+        self.bnk.crt_x.ladder_x._get_narrow(N.N_COEFF).format_verilog_concat("Y_N_COEFF")
+
+    #
+    # dump working variables after ladder step
+    #
+    def dump_after_step_crt(self):
+        self.bnk.crt_x.ladder_x._get_narrow(N.C).format_verilog_concat("X_X")
+        self.bnk.crt_x.ladder_y._get_narrow(N.C).format_verilog_concat("X_Y")
+        self.bnk.crt_y.ladder_x._get_narrow(N.C).format_verilog_concat("Y_X")
+        self.bnk.crt_y.ladder_y._get_narrow(N.C).format_verilog_concat("Y_Y")
+
+    #
+    # this deliberately converts narrow operand into redundant representation
+    #
+    def _force_overflow(self, bank_crt, sel_narrow):
+
+        # original words
+        T = bank_crt.ladder_x._get_narrow(sel_narrow).words
+
+        # loop through upper N-1 words
+        for i in range(1, len(T)):
+
+            # get msbs of the previous word
+            upper_bits = T[i-1] & _CARRY_MASK
+
+            # if the previous msbs are empty, force lsbs of the current word
+            # into them and then wipe the current lsbs
+            if upper_bits == 0:
+                lower_bits = T[i] & (_CARRY_MASK >> _WORD_WIDTH)
+                T[i] ^= lower_bits
+                T[i-1] |= (lower_bits << _WORD_WIDTH)
+
+        # overwrite original words
+        bank_crt.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, len(T), T))
+
+        print("Forced overflow.")
+
+#
+# read content of core's output bank and compare it against known good values
+#
+def compare_signature():
+
+    c  = core
+    s  = s_known
+    xm = xm_known
+    ym = ym_known
+
+    core_s  = c.out.get_value(O.S)
+    core_xm = c.out.get_value(O.XM)
+    core_ym = c.out.get_value(O.YM)
+
+    if core_s.number()  != s:  print("ERROR: core_s != s!")
+    else:                      print("s is OK")
+
+    if core_xm.number() != xm: print("ERROR: core_xm != xm!")
+    else:                      print("x_mutated is OK")
+
+    if core_ym.number() != ym: print("ERROR: core_ym != ym!")
+    else:                      print("y_mutated is OK")
+
+#
+# get current ladder mode based on two exponents' bits
+#
+def get_ladder_mode_using_crt(v, bit):
+
+    bit_value_p = (v.dp.number() & (1 << bit)) >> bit
+    bit_value_q = (v.dq.number() & (1 << bit)) >> bit
+
+    bit_value_p = bit_value_p > 0
+    bit_value_q = bit_value_q > 0
+
+    return (bit_value_p, bit_value_q)
+
+#
+# print current exponentiation progress
+#
+def print_ladder_progress(current, total):
+
+    # this will always print "100.0%" at the very last iteration, since we're
+    # counting bits from msb to lsb and the very last index is zero, which
+    # is congruent to 0 mod DUMP_PROGRESS_FACTOR
+    if (current % DUMP_PROGRESS_FACTOR) == 0:
+        pct = float((_WORD_WIDTH * total - current) / (_WORD_WIDTH * total)) * 100.0
+        print("\rdone: %5.1f%%" % pct, end='')
+
+    # move to next line after the very last iteration
+    if current == 0: print("")
+
+#
+# try to exponentiate using the quad-multiplier (dual-core, dual-ladder) scheme
+#
+def sign_using_crt():
+
+    c  = core
+    v  = vector
+    n  = n_num_words
+    pq = pq_num_words
+
+    ff = (False, False)
+                                                                   #
+                                                                   # A / B => different content in banks (A in WIDE, B in NARROW)
+                                                                   # [XY]Z => different content in ladders (XZ in X, YZ in Y)
+                                                                   # ..    => temporarily half-filled bank (omitted to save space)
+                                                                   # *     => "crossed" content (X.Y == Y.X and Y.Y == X.X)
+                                                                   #
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+                                                                   # |  A                     |  B    |  C               |  D      |  E        |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_wide_from_input   (c.bnk.crt_x, W.N,       I.N)          # |  ?                     |  ?    |  ?               |  ?      | ?         |
+    c.set_wide_from_input   (c.bnk.crt_y, W.N,       I.N)          # |  ?                     |  ?    |  ?               |  ?      | ?         |
+    c.set_wide_from_input   (c.bnk.crt_x, W.A,       I.X)          # |  ..                    |  ?    |  ?               |  ?      | ?         |
+    c.set_wide_from_input   (c.bnk.crt_y, W.A,       I.Y)          # | [XY] / ?               |  ?    |  ?               |  ?      | ?         |
+    c.set_wide_from_input   (c.bnk.crt_x, W.E,       I.M)          # | [XY] / ?               |  ?    |  ?               |  ?      | .. / ?    |
+    c.set_wide_from_input   (c.bnk.crt_y, W.E,       I.M)          # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_narrow_from_input (c.bnk.crt_x, N.N_COEFF, I.N_COEFF)    # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
+    c.set_narrow_from_input (c.bnk.crt_y, N.N_COEFF, I.N_COEFF)    # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
+    c.set_narrow_from_input (c.bnk.crt_x, N.A,       I.N_FACTOR)   # | [XY] / ..              |  ?    |  ?               |  ?      | M  / ?    |
+    c.set_narrow_from_input (c.bnk.crt_y, N.A,       I.N_FACTOR)   # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M  / ?    |
+    c.set_narrow_from_input (c.bnk.crt_x, N.E,       I.M)          # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M  / ..   |
+    c.set_narrow_from_input (c.bnk.crt_y, N.E,       I.M)          # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.A, N.A, W.B, N.B, n)                      # | [XY] / N_FACTOR        | [XY]F |  ?               |  ?      | M         | [XY]F = [XY] * N_FACTOR
+    c.modular_multiply(W.B, N.B, W.C, N.C, n, mode=ff)             # | [XY] / N_FACTOR        | [XY]F | [XY]MF           |  ?      | M         | [XY]MF = [XY]F * [XY]F
+    c.modular_multiply(W.C, N.I, W.D, N.D, n)                      # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         | [XY]M = [XY]MF * 1
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.propagate_carries(N.D, n_num_words)                          # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_output_from_narrow(O.XM, c.bnk.crt_x, N.D)               # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         |
+    c.set_output_from_narrow(O.YM, c.bnk.crt_y, N.D)               # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.E, N.B, W.C, N.C, n)                      # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         | [XY]MB = M*[XY]F
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.propagate_carries(N.C, n_num_words)                          # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.copy_crt_y2x(W.C, N.C)                                       # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_wide_from_input  (c.bnk.crt_x, W.N,       I.P)           # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
+    c.set_wide_from_input  (c.bnk.crt_y, W.N,       I.Q)           # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
+    c.set_wide_from_input  (c.bnk.crt_x, W.A,       I.P_FACTOR)    # | ...         / N_FACTOR | [XY]F |  YMB             | [XY]M   | M         |
+    c.set_wide_from_input  (c.bnk.crt_y, W.A,       I.Q_FACTOR)    # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | M         |
+    c.set_wide_from_input  (c.bnk.crt_x, W.E,       I.QINV)        # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | ..        |
+    c.set_wide_from_input  (c.bnk.crt_x, W.E,       I.QINV)        # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_narrow_from_input(c.bnk.crt_x, N.N_COEFF, I.P_COEFF)     # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
+    c.set_narrow_from_input(c.bnk.crt_y, N.N_COEFF, I.Q_COEFF)     # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
+    c.set_narrow_from_input(c.bnk.crt_x, N.A,       I.P_FACTOR)    # | [PQ]_FACTOR / ...      | [XY]F |  YMB             | [XY]M   | QINV / M  |
+    c.set_narrow_from_input(c.bnk.crt_y, N.A,       I.Q_FACTOR)    # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV / M  |
+    c.set_narrow_from_input(c.bnk.crt_x, N.E,       I.QINV)        # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV / .. |
+    c.set_narrow_from_input(c.bnk.crt_x, N.E,       I.QINV)        # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV      |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_reduce(N.C, W.D, N.D, pq)                            # | [PQ]_FACTOR            | [XY]F |  YMB             | [PQ]MBZ | QINV      | [PQ]MBZ = YMB mod [PQ]
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.D, N.A, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]MB           | [PQ]MBZ | QINV      | [PQ]MB = [PQ]MBZ * [PQ]_FACTOR
+    c.modular_multiply(W.C, N.A, W.D, N.D, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]MB           | [PQ]MBF | QINV      | [PQ]MBF = [PQ]MB * [PQ]_FACTOR
+    c.modular_multiply(W.A, N.I, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]IF           | [PQ]MBF | QINV      | [PQ]IF = 1 * [PQ]_FACTOR
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.copy_ladders_x2y(W.D, N.D, W.C, N.C)                         # | [PQ]_FACTOR            | [XY]F | [PQ]IF / [PQ]MBF | [PQ]MBF | QINV      |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    ###########################                                    # |                        |       |                  |         |           |
+    # Begin Montgomery Ladder #                                    # |                        |       |                  |         |           |
+    ###########################                                    # |                        |       |                  |         |           |
+                                                                   # |                        |       |                  |         |           |
+    for bit in range(_WORD_WIDTH * pq - 1, -1, -1):                # |                        |       |                  |         |           |
+                                                                   # |                        |       |                  |         |           |
+        m  = get_ladder_mode_using_crt(v, bit)                     # |                        |       |                  |         |           |
+        dbg = bit == DUMP_LADDER_INDEX                             # |                        |       |                  |         |           |
+                                                                   # |                        |       |                  |         |           |
+        if dbg:                                                    # |                        |       |                  |         |           |
+            if FORCE_OVERFLOW: c._force_overflow(c.bnk.crt_x, N.C) # |                        |       |                  |         |           |
+            if DUMP_VECTORS: c.dump_before_step_crt(pq, m)         # |                        |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+        c.modular_multiply(W.C, N.C, W.C, N.C, pq, mode=m, d=dbg)  # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]MBF | QINV      | <LADDER>
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+        if dbg and DUMP_VECTORS: c.dump_after_step_crt()           # |                        |       |                  |         |           |
+        print_ladder_progress(bit, pq)                             # |                        |       |                  |         |           |
+                                                                   # |                        |       |                  |         |           |
+    #########################                                      # |                        |       |                  |         |           |
+    # End Montgomery Ladder #                                      # |                        |       |                  |         |           |
+    #########################                                      # |                        |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.C, N.I, W.D, N.D, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB  | QINV      | [PQ]SB = [PQ]SBF * 1
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.propagate_carries(N.D, pq)                                   # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB  | QINV      |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.cross_ladders_x2y(W.D, N.D, W.D, N.D)                        # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB* | QINV      |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_subtract(N.D, N.C, W.C, pq)                          # | [PQ]_FACTOR            | [XY]F |  RSB             | [PQ]SB* | QINV      | RSB = PSB - QSB
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.C, N.E, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F |  RSBIZ           | [PQ]SB* | QINV      | RSBIZ = RSB * QINV
+    c.modular_multiply(W.C, N.A, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | QINV      | RSBI = RSBIZ * P_FACTOR
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_wide_from_input  (c.bnk.crt_x, W.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | ..        |
+    c.set_wide_from_input  (c.bnk.crt_x, W.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q / QINV  |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_narrow_from_input(c.bnk.crt_x, N.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q / ..    |
+    c.set_narrow_from_input(c.bnk.crt_x, N.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.regular_multiply(W.E, N.C, pq)                               # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q         | = RSBI * Q
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.merge_lha(N.A, pq)                                           # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            | [PQ]SB* | Q         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.propagate_carries(N.A, n)                                    # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            | [PQ]SB* | Q         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.copy_crt_y2x(W.D, N.D)                                       # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            |  QSB*   | Q         |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.regular_add(N.D, N.A, N.C, pq)                               # | [PQ]_FACTOR / QRSBI    | [XY]F |  SB              |  QSB*   | Q         | SB = QSB + RSBI
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_wide_from_input  (c.bnk.crt_x, W.N, I.N)                 # |                        |       |                  |         |           |
+    c.set_wide_from_input  (c.bnk.crt_y, W.N, I.N)                 # |                        |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_narrow_from_input(c.bnk.crt_x, N.N_COEFF, I.N_COEFF)     # |                        |       |                  |         |           |
+    c.set_narrow_from_input(c.bnk.crt_y, N.N_COEFF, I.N_COEFF)     # |                        |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.modular_multiply(W.B, N.C, W.A, N.A, n, ff)                  # |  S                     |       |                  |         |           | S = XF * SB
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.propagate_carries(N.A, n)                                    # |  S                     |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+    c.set_output_from_narrow(O.S, c.bnk.crt_x, N.A)                # |  S                     |       |                  |         |           |
+                                                                   # +------------------------+-------+------------------+---------+-----------+
+#
+# main()
+#
 if __name__ == "__main__":
 
+    # handy shortcuts
+    W = ModExpNG_WideBankEnum
+    N = ModExpNG_NarrowBankEnum
+    I = ModExpNG_CoreInputEnum
+    O = ModExpNG_CoreOutputEnum
+
+    # set helper quantity
+    # instantiate core
     # load test vector
-    # create worker
+    # transfer numbers from vector to core
     # set numbers of words
     # obtain known good reference value with built-in math
-    # create helper quantity
     # mutate blinding quantities with built-in math
 
-    n_num_words  = KEY_LENGTH  // _WORD_WIDTH
-    pq_num_words = n_num_words // 2
-
     i = ModExpNG_Operand(1, KEY_LENGTH)
 
-    vector = ModExpNG_TestVector()
     core   = ModExpNG_Core(i)
-    
+    vector = ModExpNG_TestVector()
+
+    core.inp.set_value(I.M,        vector.m)
+
+    core.inp.set_value(I.N,        vector.n)
+    core.inp.set_value(I.P,        vector.p)
+    core.inp.set_value(I.Q,        vector.q)
+
+    core.inp.set_value(I.N_COEFF,  vector.n_coeff)
+    core.inp.set_value(I.P_COEFF,  vector.p_coeff)
+    core.inp.set_value(I.Q_COEFF,  vector.q_coeff)
+
+    core.inp.set_value(I.N_FACTOR, vector.n_factor)
+    core.inp.set_value(I.P_FACTOR, vector.p_factor)
+    core.inp.set_value(I.Q_FACTOR, vector.q_factor)
+
+    core.inp.set_value(I.X,        vector.x)
+    core.inp.set_value(I.Y,        vector.y)
+
+    core.inp.set_value(I.QINV,     vector.qinv)
+
+    n_num_words  = KEY_LENGTH  // _WORD_WIDTH
+    pq_num_words = n_num_words // 2
+
     s_known = pow(vector.m.number(), vector.d.number(), vector.n.number())
 
-    x_mutated_known = pow(vector.x.number(), 2, vector.n.number())
-    y_mutated_known = pow(vector.y.number(), 2, vector.n.number())
+    xm_known = pow(vector.x.number(), 2, vector.n.number())
+    ym_known = pow(vector.y.number(), 2, vector.n.number())
+
+    # sign using CRT
+    print("Signing using CRT...")
+    sign_using_crt()
+    compare_signature()
+
+    # sign without CRT
+    # ...
+
+
+#
+# End-of-File
+#
+
+
 
-    
     # bring one into Montgomery domain (glue 2**r to one)
     # bring blinding coefficients into Montgomery domain (glue 2**(2*r) to x and y)
     # blind message
@@ -1329,129 +1659,3 @@ if __name__ == "__main__":
     # unblind s
     # mutate blinding factors
 
-    W = ModExpNG_WideBankEnum
-    N = ModExpNG_NarrowBankEnum
-    O = ModExpNG_CoreOutputEnum
-    
-    core.bnk.crt_x.set_modulus(vector.n, vector.n_coeff)
-    core.bnk.crt_y.set_modulus(vector.n, vector.n_coeff)
-    
-    core.bnk.crt_x.set_operands_crt_xy(W.A, N.A, vector.x, vector.n_factor)
-    core.bnk.crt_y.set_operands_crt_xy(W.A, N.A, vector.y, vector.n_factor)
-
-    core.bnk.crt_x.set_operands_crt_xy(W.E, N.E, vector.m, vector.m)
-    core.bnk.crt_y.set_operands_crt_xy(W.E, N.E, vector.m, vector.m)
-                                                                                   #                             | A               | B     | C                | D       | E |
-                                                                                   #                             +-----------------+-------+------------------+---------+---+
-                                                                                   #                             | [XY] ; N_FACTOR | ?     | ?                | ?       | M |
-    core.modular_multiply(W.A, N.A, W.B, N.B, n_num_words)                         # [XY]F  =[XY]*N_FACTOR       | [XY] ; N_FACTOR | [XY]F | ?                | ?       | M |
-    core.modular_multiply(W.B, N.B, W.C, N.C, n_num_words, mode=(False, False))    # [XY]MF =[XY]F*[XY]F         | [XY] ; N_FACTOR | [XY]F | [XY]YM           | ?       | M |
-    core.modular_multiply(W.C, N.I, W.D, N.D, n_num_words)                         # [XY]M  =[XY]MF*1            | [XY] ; N_FACTOR | [XY]F | [XY]YM           | [XY]M   | M |
-    core.reduce_narrow(N.D, n_num_words)                                           #                             |                 |       |                  |         |   |
-    core.set_output(O.XM, core.bnk.crt_x, N.D)                                     #                             |                 |       |                  |         |   |
-    core.set_output(O.YM, core.bnk.crt_y, N.D)                                     #                             |                 |       |                  |         |   |
-    core.modular_multiply(W.E, N.B, W.C, N.C, n_num_words)                         # [XY]MB =M*[XY]F             | [XY] ; N_FACTOR | [XY]F | [XY]MB           | [XY]M   | M |
-    core.move_crt_y2x(W.C, N.C)                                                    #                             | [XY] ; N_FACTOR | [XY]F | YMB              | [XY]M   | M |
-    core.bnk.crt_x.set_modulus(vector.p, vector.p_coeff)                           #                             |                 |       |                  |         |   |
-    core.bnk.crt_y.set_modulus(vector.q, vector.q_coeff)                           #                             |                 |       |                  |         |   |
-    core.bnk.crt_x.set_operands_crt_xy(W.A, N.A, vector.p_factor, vector.p_factor) #                             | [PQ]_FACTOR     | [XY]F | YMB              | [XY]M   | M |
-    core.bnk.crt_y.set_operands_crt_xy(W.A, N.A, vector.q_factor, vector.q_factor) #                             | [PQ]_FACTOR     | [XY]F | YMB              | [XY]M   | M |
-    core.reduce_narrow(N.C, n_num_words)                                           #                             |                 |       |                  |         |   |
-    core.modular_reduce(N.C, W.D, N.D, pq_num_words)                               #                             | [PQ]_FACTOR     | [XY]F | YMB              | [PQ]MBZ | M |
-    core.modular_multiply(W.D, N.A, W.C, N.C, pq_num_words)                        # [PQ]MB =[PQ]MBZ*[PQ]_FACTOR | [PQ]_FACTOR     | [XY]F | [PQ]MB           | [PQ]MBZ | M |
-    core.modular_multiply(W.C, N.A, W.D, N.D, pq_num_words)                        # [PQ]MBF=[PQ]MB*[PQ]_FACTOR  | [PQ]_FACTOR     | [XY]F | [PQ]MB           | [PQ]MBF | M |
-    core.modular_multiply(W.A, N.I, W.C, N.C, pq_num_words)                        # [PQ]MBF=[PQ]MB*[PQ]_FACTOR  | [PQ]_FACTOR     | [XY]F | [PQ]IF           | [PQ]MBF | M |
-    core.move_ladders_x2y(W.D, N.D, W.C, N.C)                                      #                             | [PQ]_FACTOR     | [XY]F | [PQ]IF / [PQ]MBF | [PQ]MBF | M |
-    
-    #PIF = core.bnk.crt_x.ladder_x.narrow._get_value(N.C)#
-    #QIF = core.bnk.crt_y.ladder_x.narrow._get_value(N.C)#
-    
-
-
-
-########################
-
-    for bit in range(_WORD_WIDTH * pq_num_words - 1, -1, -1):
-    
-        bit_value_p = (vector.dp.number() & (1 << bit)) >> bit
-        bit_value_q = (vector.dq.number() & (1 << bit)) >> bit
-
-        bit_value_p = bit_value_p > 0
-        bit_value_q = bit_value_q > 0
-            
-        # mode = ... (shorted the next line for better readability)
-            
-        core.modular_multiply(W.C, N.C, W.C, N.C, pq_num_words, mode=(bit_value_p, bit_value_q))    # <LADDER>   | [PQ]_FACTOR     | [XY]F | [PQ]SBF          | [PQ]MBF | M |
-        
-        if (bit % 4) == 0:
-            pct = float((_WORD_WIDTH * pq_num_words - bit) / (_WORD_WIDTH * pq_num_words)) * 100.0
-            print("\rdone: %5.1f%%" % pct, end='')
-        
-    print("")
-
-    core.modular_multiply(W.C, N.I, W.D, N.D, pq_num_words)                        # [PQ]SB=[PQ]SBF*1            | [PQ]_FACTOR     | [XY]F | [PQ]SBF          | [PQ]SB  | M |
-
-############################
-    
-    core.reduce_narrow(N.D, pq_num_words)
-    
-    #SQB = core.bnk.crt_y.ladder_x.narrow._get_value(N.D)
-    
-    core.flip_ladder_y2x(W.D, N.D, W.D, N.D)
-    core.modular_subtract(N.D, N.C, W.C, pq_num_words)                     #                             | [PQ]_FACTOR     | [XY]F | RSB              | [PQ]SB  | M    |
-    core.bnk.crt_x.set_operands_crt_xy(W.E, N.E, vector.qinv, vector.qinv) #                             | [PQ]_FACTOR     | [XY]F | RSB              | [PQ]SB  | QINV |
-    core.bnk.crt_y.set_operands_crt_xy(W.E, N.E, vector.qinv, vector.qinv) #                             | [PQ]_FACTOR     | [XY]F | RSB              | [PQ]SB  | QINV |
-
-    core.modular_multiply(W.C, N.E, W.C, N.C, pq_num_words)                #                             | [PQ]_FACTOR     | [XY]F | RSBIZ            | [PQ]SB  | QINV |
-    core.modular_multiply(W.C, N.A, W.C, N.C, pq_num_words)                #                             | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | QINV |
-    
-    core.bnk.crt_x.set_operands_crt_xy(W.E, N.E, vector.q, vector.q) #                             | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | Q |
-    core.bnk.crt_y.set_operands_crt_xy(W.E, N.E, vector.q, vector.q) #                             | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | Q |
-    
-    core.just_multiply(W.E, N.C, pq_num_words)                            # | [PQ]_FACTOR     | [XY]F | RSBI             | [PQ]SB  | Q |
-
-    core.merge_lha(N.A, pq_num_words)
-    
-    core.reduce_narrow(N.A, n_num_words)
-
-    core.move_crt_y2x(W.D, N.D)
-    
-    #RQSBI = core.bnk.crt_x.ladder_x.narrow._get_value(N.A)
-    
-    core.just_add(N.D, N.A, N.C, pq_num_words)   # 
-    SB = core.bnk.crt_x.ladder_x._get_narrow(N.C)
-    #print(hex(SB.number()))
-
-    #SB = core.wrk.add(SQB, RQSBI, pq_num_words) # just_add
-    #print(hex(SB.number()))
-    
-    
-    # check why multiplication is not commutative!?
-    
-    
-    XF = core.bnk.crt_x.ladder_x.wide._get_value(W.B)
-
-    core.bnk.crt_x.set_modulus(vector.n, vector.n_coeff)
-    core.bnk.crt_y.set_modulus(vector.n, vector.n_coeff)
-
-    core.modular_multiply(W.B, N.C, W.A, N.A, n_num_words, mode=(False, False))
-    core.reduce_narrow(N.A, n_num_words)    
-    core.set_output(O.S, core.bnk.crt_x, N.A)
-    
-    S  = core.out.get_value(O.S)
-    XM = core.out.get_value(O.XM)
-    YM = core.out.get_value(O.YM)
-    
-    if S.number() != s_known: print("ERROR: s_crt_unblinded != s_known!")
-    else:                     print("s is OK")
-
-    if XM.number() != x_mutated_known: print("ERROR: x_mutated != x_mutated_known!")
-    else:                              print("x_mutated is OK")
-
-    if YM.number() != y_mutated_known: print("ERROR: y_mutated != y_mutated_known!")
-    else:                              print("y_mutated is OK")
-
-
-#
-# End-of-File
-#



More information about the Commits mailing list