Add optimized bignum multiplication for Aarch64.

x0-x3 are skipped so that function parameters do not have to be moved. MULADDC_INIT and MULADDC_STOP are mostly empty because it is more efficient to keep everything in registers (and that should easily be possible). I considered a MULADDC_HUIT implementation, but could not think of something that would be more efficient than basically 8 consecutive MULADDC_CORE. You could combine the loads and stores, but it's probably more efficient to interleave them with arithmetic, depending on the specific microarchitecture. NEON can do a 64x64->128 bit multiplication (and optional accumulation) in one instruction, but is not great at handling carries.
2025-12-14 14:31:30 +00:00 · 2018-08-16 02:01:57 -07:00 · 2018-08-16 02:01:57 -07:00 · cc1871e674
parent 03d2daf55c
commit cc1871e674
1 changed files with 24 additions and 0 deletions
--- a/include/mbedtls/bn_mul.h
+++ b/include/mbedtls/bn_mul.h
@ -198,6 +198,30 @@
 #endif /* AMD64 */
 #if defined(__aarch64__)
 /*
  * AArch64 multiply-accumulate step, assembled from three macros that expand
  * to a single asm statement: MULADDC_INIT, one or more MULADDC_CORE, then
  * MULADDC_STOP.
  *
  * The expansion site must provide the variables named in the operand list:
  *   s - pointer to the 64-bit source limbs   (advanced by 8 bytes per CORE)
  *   d - pointer to the 64-bit destination    (advanced by 8 bytes per CORE)
  *   c - carry limb, read and updated by every CORE
  *   b - 64-bit multiplier, read only
  *
  * Each MULADDC_CORE performs  *d = low64(*d + *s * b + c)  and leaves the
  * high part (including both carries) in c.
  *
  * c, d and s are read-write operands ("+"): they are modified inside the
  * template and their new values are needed both by the next MULADDC_CORE
  * and by the surrounding C code. They are also marked early-clobber ("&")
  * because they are written while b is still going to be read, so they must
  * never share a register with b. The "memory" clobber tells the compiler
  * that the asm loads from s and stores to d.
  */
 #define MULADDC_INIT                \
    asm(
 #define MULADDC_CORE                \
        "ldr x4, [%2], #8   \n\t"   /* x4 = *s, s += 8              */ \
        "ldr x5, [%1]       \n\t"   /* x5 = *d                      */ \
        "mul x6, x4, %3     \n\t"   /* x6 = low  64 bits of x4 * b  */ \
        "umulh x7, x4, %3   \n\t"   /* x7 = high 64 bits of x4 * b  */ \
        "adds x5, x5, x6    \n\t"   /* x5 += x6, sets carry flag    */ \
        "adc x7, x7, xzr    \n\t"   /* fold carry into high word    */ \
        "adds x5, x5, %0    \n\t"   /* x5 += c, sets carry flag     */ \
        "adc %0, x7, xzr    \n\t"   /* c = high word + carry        */ \
        "str x5, [%1], #8   \n\t"   /* *d = x5, d += 8              */
 #define MULADDC_STOP                                    \
         : "+&r" (c), "+&r" (d), "+&r" (s)              \
         : "r" (b)                                      \
         : "x4", "x5", "x6", "x7", "memory", "cc"       \
    );
 #endif /* Aarch64 */
 #if defined(__mc68020__) || defined(__mcpu32__)
 #define MULADDC_INIT                    \