[Cryptech-Commits] [sw/stm32] 01/03: Add loop unrolling to bring the profilable mem* functions closer to newlib, because memset is called a LOT in the course of RSA signing, and we need to understand how much time we're actually spending there.

git at cryptech.is git at cryptech.is
Mon Dec 3 22:35:38 UTC 2018


This is an automated email from the git hooks/post-receive script.

paul at psgd.org pushed a commit to branch master
in repository sw/stm32.

commit 97034edb35e92361daaa24512989d00f6c3fd517
Author: Paul Selkirk <paul at psgd.org>
AuthorDate: Mon Nov 26 17:26:55 2018 -0500

    Add loop unrolling to bring the profilable mem* functions closer to
    newlib, because memset is called a LOT in the course of RSA signing, and
    we need to understand how much time we're actually spending there.
---
 libraries/libprof/Makefile               |  6 +++-
 memfunc.c => libraries/libprof/memfunc.c | 52 ++++++++++++++++++++++++--------
 projects/hsm/Makefile                    |  1 -
 3 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/libraries/libprof/Makefile b/libraries/libprof/Makefile
index 4fe5fb4..37b9a23 100644
--- a/libraries/libprof/Makefile
+++ b/libraries/libprof/Makefile
@@ -1,12 +1,16 @@
 LIB = libprof.a
 
-OBJS = gmon.o profil.o profiler.o
+OBJS = gmon.o profil.o profiler.o memfunc.o
 
 # Don't profile the profiling code, because that way lies madness (and recursion).
 CFLAGS := $(subst -pg,,$(CFLAGS))
 
 all: $(LIB)
 
+# But do profile the mem functions
+memfunc.o: memfunc.c
+	$(CC) $(CFLAGS) -pg -c -o $@ $<
+
 %.o : %.c
 	$(CC) $(CFLAGS) -c -o $@ $<
 
diff --git a/memfunc.c b/libraries/libprof/memfunc.c
similarity index 52%
rename from memfunc.c
rename to libraries/libprof/memfunc.c
index fd94b28..fc908e1 100644
--- a/memfunc.c
+++ b/libraries/libprof/memfunc.c
@@ -4,9 +4,15 @@
 /*
  * Profilable substitutes for mem*(), lacking libc_p.a
  *
- * This code was written with reference to newlib, but does not copy every
- * quirk and loop-unrolling optimization from newlib. Its only purpose is
- * to let us figure out who is calling memcpy 2 million times.
+ * This code was written with reference to newlib, and was recently
+ * brought closer into line with newlib, to make profiling more accurate.
+ *
+ * Newlib is maintained by Cygwin, which is Red Hat. There is no copyright
+ * statement in the corresponding newlib source files, nor is there a
+ * COPYING file in newlib/libc/string or newlib/libc. Consider this file
+ * to be covered under one or more of the 50 copyright notices in
+ * newlib/COPYING, most of which are BSD. In any case, this file is only
+ * used for profiling, and is not used in production builds.
  */
 
 #define is_word_aligned(x) (((size_t)(x) & 3) == 0)
@@ -16,12 +22,19 @@ void *memcpy(void *dst, const void *src, size_t n)
     uint8_t *d8 = (uint8_t *)dst;
     uint8_t *s8 = (uint8_t *)src;
 
-    if (n >= 4 && is_word_aligned(src) && is_word_aligned(dst)) {
+    if (n >= sizeof(uint32_t) && is_word_aligned(src) && is_word_aligned(dst)) {
         uint32_t *d32 = (uint32_t *)dst;
         uint32_t *s32 = (uint32_t *)src;
-        while (n >= 4) {
+        while (n >= 4 * sizeof(uint32_t)) {
+            *d32++ = *s32++;
+            *d32++ = *s32++;
+            *d32++ = *s32++;
             *d32++ = *s32++;
-            n -= 4;
+            n -= 4 * sizeof(uint32_t);
+        }
+        while (n >= sizeof(uint32_t)) {
+            *d32++ = *s32++;
+            n -= sizeof(uint32_t);
         }
         d8 = (uint8_t *)d32;
         s8 = (uint8_t *)s32;
@@ -38,12 +51,25 @@ void *memset(void *dst, int c, size_t n)
     uint8_t *d8 = (uint8_t *)dst;
     uint8_t c8 = (uint8_t)c;
 
-    if (n >= 4 && is_word_aligned(dst)) {
-        uint32_t *d32 = (uint32_t *)dst;
+    while (!is_word_aligned(d8)) {
+        if (n--)
+            *d8++ = c8;
+        else
+            return dst;
+    }
+    if (n >= sizeof(uint32_t)) {
+        uint32_t *d32 = (uint32_t *)d8;
         uint32_t c32 = (c8 << 24) | (c8 << 16) | (c8 << 8) | (c8);
-        while (n >= 4) {
+        while (n >= 4 * sizeof(uint32_t)) {
+            *d32++ = c32;
+            *d32++ = c32;
+            *d32++ = c32;
+            *d32++ = c32;
+            n -= 4 * sizeof(uint32_t);
+        }
+        while (n >= sizeof(uint32_t)) {
             *d32++ = c32;
-            n -= 4;
+            n -= sizeof(uint32_t);
         }
         d8 = (uint8_t *)d32;
     }
@@ -59,15 +85,15 @@ int memcmp(const void *dst, const void *src, size_t n)
     uint8_t *d8 = (uint8_t *)dst;
     uint8_t *s8 = (uint8_t *)src;
 
-    if (n >= 4 && is_word_aligned(src) && is_word_aligned(dst)) {
+    if (n >= sizeof(uint32_t) && is_word_aligned(src) && is_word_aligned(dst)) {
         uint32_t *d32 = (uint32_t *)dst;
         uint32_t *s32 = (uint32_t *)src;
-        while (n >= 4) {
+        while (n >= sizeof(uint32_t)) {
             if (*d32 != *s32)
                 break;
             d32++;
             s32++;
-            n -= 4;
+            n -= sizeof(uint32_t);
         }
         d8 = (uint8_t *)d32;
         s8 = (uint8_t *)s32;
diff --git a/projects/hsm/Makefile b/projects/hsm/Makefile
index 3430e14..37c552d 100644
--- a/projects/hsm/Makefile
+++ b/projects/hsm/Makefile
@@ -25,7 +25,6 @@ LDFLAGS += -mfloat-abi=hard -mfpu=fpv4-sp-d16
 LDFLAGS += -Wl,--gc-sections
 
 ifdef DO_PROFILING
-OBJS += $(TOPLEVEL)/memfunc.o
 LDFLAGS += --specs=rdimon.specs -lc -lrdimon
 endif
 



More information about the Commits mailing list