mirror of
https://github.com/yuzu-emu/mbedtls.git
synced 2025-02-24 08:26:48 +00:00
Add optimized bignum multiplication for Aarch64.
x0-x3 are skipped so that function parameters do not have to be moved. MULADDC_INIT and MULADDC_STOP are mostly empty because it is more efficient to keep everything in registers (and that should easily be possible). I considered a MULADDC_HUIT implementation, but could not think of anything that would be more efficient than simply 8 consecutive MULADDC_CORE invocations. You could combine the loads and stores, but it is probably more efficient to interleave them with arithmetic, depending on the specific microarchitecture. NEON can do a 64x64->128 bit multiplication (and optional accumulation) in one instruction, but is not great at handling carries.
This commit is contained in:
parent
03d2daf55c
commit
cc1871e674
|
@ -198,6 +198,30 @@
|
||||||
|
|
||||||
#endif /* AMD64 */
|
#endif /* AMD64 */
|
||||||
|
|
||||||
|
#if defined(__aarch64__)

/*
 * AArch64 multiply-accumulate primitive (GCC extended inline asm).
 *
 * The three macros are pasted back to back to form ONE asm statement:
 *
 *     MULADDC_INIT  MULADDC_CORE [MULADDC_CORE ...]  MULADDC_STOP
 *
 * They expect these names to be in scope at the expansion site:
 *     s  - pointer to the next 64-bit source limb      (advanced by 8 per CORE)
 *     d  - pointer to the next 64-bit destination limb (advanced by 8 per CORE)
 *     c  - 64-bit carry in / carry out
 *     b  - 64-bit multiplier
 *
 * Each MULADDC_CORE computes the 128-bit value  (*s) * b + (*d) + c,
 * stores the low 64 bits to *d, leaves the high 64 bits in c, and
 * post-increments both pointers.
 *
 * Operand numbering used by MULADDC_CORE (must match MULADDC_STOP):
 *     %0 = c, %1 = d, %2 = s, %3 = b
 *
 * c, d and s are read-write ("+r") operands: the asm both consumes and
 * updates them, and a repeated CORE must see the values left by the
 * previous one.  Declaring d/s as write-only outputs plus separate
 * input copies is wrong: the store would go through an uninitialised
 * register and the carry/pointers would not chain across repetitions.
 *
 * x4-x7 are scratch registers (x0-x3 are deliberately left alone so
 * that function parameters do not have to be moved).  "cc" is clobbered
 * by adds/adc; "memory" is clobbered because the asm loads from *s and
 * loads/stores *d behind the compiler's back.
 */
#define MULADDC_INIT                \
    asm(

#define MULADDC_CORE                \
        "ldr x4, [%2], #8   \n\t"   \
        "ldr x5, [%1]       \n\t"   \
        "mul x6, x4, %3     \n\t"   \
        "umulh x7, x4, %3   \n\t"   \
        "adds x5, x5, x6    \n\t"   \
        "adc x7, x7, xzr    \n\t"   \
        "adds x5, x5, %0    \n\t"   \
        "adc %0, x7, xzr    \n\t"   \
        "str x5, [%1], #8   \n\t"

#define MULADDC_STOP                                        \
         : "+r" (c), "+r" (d), "+r" (s)                     \
         : "r" (b)                                          \
         : "x4", "x5", "x6", "x7", "cc", "memory"           \
    );

#endif /* Aarch64 */
|
||||||
|
|
||||||
#if defined(__mc68020__) || defined(__mcpu32__)
|
#if defined(__mc68020__) || defined(__mcpu32__)
|
||||||
|
|
||||||
#define MULADDC_INIT \
|
#define MULADDC_INIT \
|
||||||
|
|
Loading…
Reference in a new issue