mirror of
synced 2025-03-04 06:39:39 +00:00
x64 intrinsics for msvc in bn_mul, timing and aesni modules
AES-NI implementation for MSVC x64 using intrinsics. Implement the rdtsc timing function using intrinsics for x64. Use the 128-bit-result multiply on MSVC/x64.
This commit is contained in:
@ -42,6 +42,11 @@
#define MBEDTLS_HAVE_X86_64
#if defined(_MSC_VER) && defined(_M_X64) && \
! defined(MBEDTLS_HAVE_X86_64)
#define MBEDTLS_HAVE_X86_64
#if defined(MBEDTLS_HAVE_X86_64)
#ifdef __cplusplus
@ -924,10 +924,30 @@
__asm mov s, esi \
#endif /* SSE2 */
#endif /* MSVC */
#endif /* (MSVC && _M_IX86) || __WATCOMC__ */
#endif /* MBEDTLS_HAVE_ASM */
/*
 * MSVC / x64 multiply-accumulate step built from <intrin.h> intrinsics.
 *
 * The three macros are meant to be expanded back to back:
 *   MULADDC_INIT  opens a scope and declares the scratch variables,
 *   MULADDC_CORE  handles one limb and may be repeated,
 *   MULADDC_STOP  closes the scope opened by MULADDC_INIT.
 *
 * MULADDC_CORE uses the names s, d, b and c from the expansion site:
 * it computes the 128-bit value  (*s) * b + c + (*d),  writes the low
 * 64 bits to *d, leaves the high 64 bits in c, and advances s and d.
 */
#if defined(_MSC_VER) && defined(_M_X64)

#include <intrin.h>

#define MULADDC_INIT                                \
    {                                               \
        mbedtls_mpi_uint r0, r1;                    \
        unsigned char carry;

/* r1:r0 = *s * b; then add c and *d to r0, rippling each carry into r1. */
#define MULADDC_CORE                                \
        r0 = _umul128( *(s++), b, &r1 );            \
        carry = _addcarry_u64( 0, r0, c, &r0 );     \
        _addcarry_u64( carry, r1, 0, &r1 );         \
        carry = _addcarry_u64( 0, r0, *d, &r0 );    \
        _addcarry_u64( carry, r1, 0, &r1 );         \
        c = r1; *(d++) = r0;

/* Must expand to the closing brace matching MULADDC_INIT. */
#define MULADDC_STOP                                \
    }

#endif /* _MSC_VER && _M_X64 */
#if !defined(MULADDC_CORE)
#if defined(MBEDTLS_HAVE_UDBL)
@ -68,7 +68,7 @@
#error "MBEDTLS_HAVE_TIME_DATE without MBEDTLS_HAVE_TIME does not make sense"
#if defined(MBEDTLS_AESNI_C) && !defined(MBEDTLS_HAVE_ASM)
#if defined(MBEDTLS_AESNI_C) && !defined(MBEDTLS_HAVE_ASM) && !(defined(_MSC_VER) && defined(_M_X64))
#error "MBEDTLS_AESNI_C defined, but not all prerequisites"
@ -42,6 +42,11 @@
#if defined(MBEDTLS_HAVE_X86_64)
#if defined(_MSC_VER) && defined(_M_X64)
#include <intrin.h>
* AES-NI support detection routine
@ -52,11 +57,17 @@ int mbedtls_aesni_has_support( unsigned int what )
if( ! done )
int regs[4]; // eax, ebx, ecx, edx
__cpuid( regs, 1 );
c = regs[2];
asm( "movl $1, %%eax \n\t"
"cpuid \n\t"
: "=c" (c)
: "eax", "ebx", "edx" );
done = 1;
@ -97,6 +108,28 @@ int mbedtls_aesni_crypt_ecb( mbedtls_aes_context *ctx,
const unsigned char input[16],
unsigned char output[16] )
__m128i* rk, a;
int i;
rk = (__m128i*)ctx->rk;
a = _mm_xor_si128( _mm_loadu_si128( (__m128i*)input ), _mm_loadu_si128( rk++ ) );
for (i = ctx->nr - 1; i; --i)
a = _mm_aesenc_si128( a, _mm_loadu_si128( rk++ ) );
a = _mm_aesenclast_si128( a, _mm_loadu_si128( rk ) );
for (i = ctx->nr - 1; i; --i)
a = _mm_aesdec_si128( a, _mm_loadu_si128( rk++ ) );
a = _mm_aesdeclast_si128( a, _mm_loadu_si128( rk ) );
_mm_storeu_si128( (__m128i*)output, a );
asm( "movdqu (%3), %%xmm0 \n\t" // load input
"movdqu (%1), %%xmm1 \n\t" // load round key 0
"pxor %%xmm1, %%xmm0 \n\t" // round 0
@ -130,10 +163,70 @@ int mbedtls_aesni_crypt_ecb( mbedtls_aes_context *ctx,
: "r" (ctx->nr), "r" (ctx->rk), "r" (mode), "r" (input), "r" (output)
: "memory", "cc", "xmm0", "xmm1" );
return( 0 );
/*
 * Carry-less multiplication of two 128-bit values, giving a 256-bit result.
 *
 * a, b  the two 128-bit operands
 * r0    receives the low 128 bits of the product
 * r1    receives the high 128 bits of the product
 *
 * With a = a1:a0 and b = b1:b0 (64-bit halves) the four partial products
 * are combined schoolbook style; the two cross terms overlap the middle
 * 128 bits of the result.
 */
static inline void clmul256( __m128i a, __m128i b, __m128i *r0, __m128i *r1 )
{
    __m128i c, d, e, f, ef;

    c = _mm_clmulepi64_si128( a, b, 0x00 );     // a0 * b0
    d = _mm_clmulepi64_si128( a, b, 0x11 );     // a1 * b1
    e = _mm_clmulepi64_si128( a, b, 0x10 );     // a0 * b1
    f = _mm_clmulepi64_si128( a, b, 0x01 );     // a1 * b0

    // r0 = f0^e0^c1:c0 = c1:c0 ^ f0^e0:0
    // r1 = d1:f1^e1^d0 = d1:d0 ^ 0:f1^e1
    ef = _mm_xor_si128( e, f );
    *r0 = _mm_xor_si128( c, _mm_slli_si128( ef, 8 ) );
    *r1 = _mm_xor_si128( d, _mm_srli_si128( ef, 8 ) );
}
/*
 * Shift the 256-bit value a1:a0 left by one bit.
 *
 * a0, a1  low and high 128-bit halves of the input
 * s0, s1  receive the low and high 128-bit halves of the result
 *
 * The per-lane 64-bit shifts drop the top bit of every lane, so those
 * bits are extracted separately and re-inserted one lane higher.  The
 * top bit of a1 has nowhere to go and is discarded.
 */
static inline void sll256( __m128i a0, __m128i a1, __m128i *s0, __m128i *s1 )
{
    __m128i l0, l1, r0, r1;

    /* Each 64-bit lane shifted left by one. */
    l0 = _mm_slli_epi64( a0, 1 );
    l1 = _mm_slli_epi64( a1, 1 );

    /* Bit 63 of each lane, moved down to bit 0: the bits that were lost. */
    r0 = _mm_srli_epi64( a0, 63 );
    r1 = _mm_srli_epi64( a1, 63 );

    /* Low half: carry from lane 0 of a0 into lane 1 of a0. */
    *s0 = _mm_or_si128( l0, _mm_slli_si128( r0, 8 ) );

    /*
     * High half: carry from lane 1 of a0 into lane 0 of a1, and from
     * lane 0 of a1 into lane 1 of a1.
     */
    *s1 = _mm_or_si128( _mm_or_si128( l1, _mm_srli_si128( r0, 8 ) ),
                        _mm_slli_si128( r1, 8 ) );
}
/*
 * Fold a 256-bit value x3:x2:x1:x0 down to 128 bits.
 *
 * x10  low 128 bits of the input  (x1:x0)
 * x32  high 128 bits of the input (x3:x2)
 *
 * Returns the 128-bit reduced value.  The shift amounts (63/62/57 and
 * 1/2/7) presumably encode the GCM polynomial x^128 + x^7 + x^2 + x + 1
 * in bit-reflected form -- NOTE(review): confirm against the algorithm
 * this was ported from.
 */
static inline __m128i reducemod128( __m128i x10, __m128i x32 )
{
    __m128i a, b, c, dx0, e, f, g, h;

    // (1) left shift x0 by 63, 62 and 57
    //     (the upper lane is shifted too, but is discarded in step 2)
    a = _mm_slli_epi64( x10, 63 );
    b = _mm_slli_epi64( x10, 62 );
    c = _mm_slli_epi64( x10, 57 );

    // (2) compute D xor'ing a, b, c and x1
    // d:x0 = x1:x0 ^ [a^b^c:0]
    dx0 = _mm_xor_si128( x10,
              _mm_slli_si128( _mm_xor_si128( _mm_xor_si128( a, b ), c ), 8 ) );

    // (3) right shift [d:x0] by 1, 2, 7
    //     (128-bit shifts: the bits leaving d are carried into x0)
    e = _mm_or_si128( _mm_srli_epi64( dx0, 1 ),
                      _mm_srli_si128( _mm_slli_epi64( dx0, 63 ), 8 ) );
    f = _mm_or_si128( _mm_srli_epi64( dx0, 2 ),
                      _mm_srli_si128( _mm_slli_epi64( dx0, 62 ), 8 ) );
    g = _mm_or_si128( _mm_srli_epi64( dx0, 7 ),
                      _mm_srli_si128( _mm_slli_epi64( dx0, 57 ), 8 ) );

    // (4) compute h = d^e1^f1^g1 : x0^e0^f0^g0
    h = _mm_xor_si128( dx0, _mm_xor_si128( e, _mm_xor_si128( f, g ) ) );

    // result is x3^h1:x2^h0
    return _mm_xor_si128( x32, h );
}
* GCM multiplication: c = a times b in GF(2^128)
* Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5.
@ -142,6 +235,22 @@ void mbedtls_aesni_gcm_mult( unsigned char c[16],
const unsigned char a[16],
const unsigned char b[16] )
__m128i xa, xb, m0, m1, x10, x32, r;
xa.m128i_u64[1] = _byteswap_uint64( *((unsigned __int64*)a + 0) );
xa.m128i_u64[0] = _byteswap_uint64( *((unsigned __int64*)a + 1) );
xb.m128i_u64[1] = _byteswap_uint64( *((unsigned __int64*)b + 0) );
xb.m128i_u64[0] = _byteswap_uint64( *((unsigned __int64*)b + 1) );
clmul256( xa, xb, &m0, &m1 );
sll256( m0, m1, &x10, &x32 );
r = reducemod128( x10, x32 );
*((unsigned __int64*)c + 0) = _byteswap_uint64( r.m128i_u64[1] );
*((unsigned __int64*)c + 1) = _byteswap_uint64( r.m128i_u64[0] );
unsigned char aa[16], bb[16], cc[16];
size_t i;
@ -242,6 +351,7 @@ void mbedtls_aesni_gcm_mult( unsigned char c[16],
/* Now byte-reverse the outputs */
for( i = 0; i < 16; i++ )
c[i] = cc[15 - i];
@ -258,22 +368,109 @@ void mbedtls_aesni_inverse_key( unsigned char *invkey,
memcpy( ik, fk, 16 );
for( fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16 )
_mm_storeu_si128( (__m128i*)ik, _mm_aesimc_si128( _mm_loadu_si128( (__m128i*)fk) ) );
asm( "movdqu (%0), %%xmm0 \n\t"
AESIMC xmm0_xmm0 "\n\t"
"movdqu %%xmm0, (%1) \n\t"
: "r" (fk), "r" (ik)
: "memory", "xmm0" );
memcpy( ik, fk, 16 );
/*
 * One mixing step of the 128-bit key expansion.
 *
 * key  previous round key, as four 32-bit words w0..w3
 * kg   value whose top 32-bit word is mixed into every word of the result
 *      (the caller passes the output of _mm_aeskeygenassist_si128)
 *
 * Returns w0, w0^w1, w0^w1^w2, w0^w1^w2^w3, each XORed with the top word
 * of kg.
 */
static inline __m128i aes_key_128_assist( __m128i key, __m128i kg )
{
    /* Three shift-and-xor passes build the running XOR of the four words. */
    key = _mm_xor_si128( key, _mm_slli_si128( key, 4 ) );
    key = _mm_xor_si128( key, _mm_slli_si128( key, 4 ) );
    key = _mm_xor_si128( key, _mm_slli_si128( key, 4 ) );

    /* Broadcast word 3 of kg to all four positions. */
    kg = _mm_shuffle_epi32( kg, _MM_SHUFFLE( 3, 3, 3, 3 ) );

    return _mm_xor_si128( key, kg );
}
// [AES-WP] Part of Fig. 25 page 32
/*
 * One mixing step of the 192-bit key expansion; updates both values in place.
 *
 * temp1  in/out: four 32-bit words, replaced by their running XOR, each
 *        XORed with word 1 of kg
 * temp3  in/out: XORed with itself shifted up by one word, then with the
 *        new top word of *temp1 broadcast to all four positions
 * kg     value whose word 1 is mixed into *temp1 (the caller passes the
 *        output of _mm_aeskeygenassist_si128)
 */
static inline void aes_key_192_assist( __m128i *temp1, __m128i *temp3, __m128i kg )
{
    __m128i temp4;

    /* Broadcast word 1 of kg. */
    kg = _mm_shuffle_epi32( kg, 0x55 );

    /* Running XOR of the words of *temp1: XOR in shifts by 1, 2 and 3 words. */
    temp4 = _mm_slli_si128( *temp1, 0x4 );
    *temp1 = _mm_xor_si128( *temp1, temp4 );
    temp4 = _mm_slli_si128( temp4, 0x4 );
    *temp1 = _mm_xor_si128( *temp1, temp4 );
    temp4 = _mm_slli_si128( temp4, 0x4 );
    *temp1 = _mm_xor_si128( *temp1, temp4 );
    *temp1 = _mm_xor_si128( *temp1, kg );

    /* The freshly computed top word of *temp1 feeds the update of *temp3. */
    kg = _mm_shuffle_epi32( *temp1, 0xff );
    temp4 = _mm_slli_si128( *temp3, 0x4 );
    *temp3 = _mm_xor_si128( *temp3, temp4 );
    *temp3 = _mm_xor_si128( *temp3, kg );
}
// [AES-WP] Part of Fig. 26 page 34
/*
 * First half of a 256-bit key expansion step; updates *temp1 in place.
 *
 * temp1  in/out: four 32-bit words, replaced by their running XOR, each
 *        XORed with the top word of kg
 * kg     value whose word 3 is mixed in (the caller passes the output of
 *        _mm_aeskeygenassist_si128)
 */
static inline void aes_key_256_assist_1( __m128i *temp1, __m128i kg )
{
    __m128i temp4;

    /* Broadcast word 3 of kg. */
    kg = _mm_shuffle_epi32( kg, 0xff );

    /* Running XOR of the words of *temp1: XOR in shifts by 1, 2 and 3 words. */
    temp4 = _mm_slli_si128( *temp1, 0x4 );
    *temp1 = _mm_xor_si128( *temp1, temp4 );
    temp4 = _mm_slli_si128( temp4, 0x4 );
    *temp1 = _mm_xor_si128( *temp1, temp4 );
    temp4 = _mm_slli_si128( temp4, 0x4 );
    *temp1 = _mm_xor_si128( *temp1, temp4 );

    *temp1 = _mm_xor_si128( *temp1, kg );
}
/*
 * Second half of a 256-bit key expansion step; updates *temp3 in place.
 *
 * temp1  in: value fed to _mm_aeskeygenassist_si128 (round constant 0);
 *        not modified
 * temp3  in/out: four 32-bit words, replaced by their running XOR, each
 *        XORed with word 2 of the key-generation-assist result
 */
static inline void aes_key_256_assist_2( __m128i *temp1, __m128i *temp3 )
{
    __m128i temp2, temp4;

    /* Broadcast word 2 of the assist output. */
    temp4 = _mm_aeskeygenassist_si128( *temp1, 0x0 );
    temp2 = _mm_shuffle_epi32( temp4, 0xaa );

    /* Running XOR of the words of *temp3: XOR in shifts by 1, 2 and 3 words. */
    temp4 = _mm_slli_si128( *temp3, 0x4 );
    *temp3 = _mm_xor_si128( *temp3, temp4 );
    temp4 = _mm_slli_si128( temp4, 0x4 );
    *temp3 = _mm_xor_si128( *temp3, temp4 );
    temp4 = _mm_slli_si128( temp4, 0x4 );
    *temp3 = _mm_xor_si128( *temp3, temp4 );

    *temp3 = _mm_xor_si128( *temp3, temp2 );
}
* Key expansion, 128-bit case
static void aesni_setkey_enc_128( unsigned char *rk,
const unsigned char *key )
__m128i* xrk, k;
xrk = (__m128i*)rk;
#define EXPAND_ROUND(k, rcon) \
_mm_storeu_si128( xrk++, k ); \
k = aes_key_128_assist( k, _mm_aeskeygenassist_si128( k, rcon ) )
k = _mm_loadu_si128( (__m128i*)key );
EXPAND_ROUND( k, 0x01 );
EXPAND_ROUND( k, 0x02 );
EXPAND_ROUND( k, 0x04 );
EXPAND_ROUND( k, 0x08 );
EXPAND_ROUND( k, 0x10 );
EXPAND_ROUND( k, 0x20 );
EXPAND_ROUND( k, 0x40 );
EXPAND_ROUND( k, 0x80 );
EXPAND_ROUND( k, 0x1b );
EXPAND_ROUND( k, 0x36 );
_mm_storeu_si128( xrk, k );
asm( "movdqu (%1), %%xmm0 \n\t" // copy the original key
"movdqu %%xmm0, (%0) \n\t" // as round key 0
"jmp 2f \n\t" // skip auxiliary routine
@ -316,6 +513,7 @@ static void aesni_setkey_enc_128( unsigned char *rk,
: "r" (rk), "r" (key)
: "memory", "cc", "0" );
@ -324,6 +522,37 @@ static void aesni_setkey_enc_128( unsigned char *rk,
static void aesni_setkey_enc_192( unsigned char *rk,
const unsigned char *key )
__m128i temp1, temp3;
__m128i *key_schedule = (__m128i*)rk;
temp1 = _mm_loadu_si128( (__m128i*)key );
temp3 = _mm_loadu_si128( (__m128i*)(key + 16) );
key_schedule[0] = temp1;
key_schedule[1] = temp3;
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128(temp3, 0x1) );
key_schedule[1] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[1] ), _mm_castsi128_pd( temp1 ), 0 ) );
key_schedule[2] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x2 ) );
key_schedule[3] = temp1;
key_schedule[4] = temp3;
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x4 ) );
key_schedule[4] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[4] ), _mm_castsi128_pd( temp1 ), 0 ) );
key_schedule[5] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x8 ) );
key_schedule[6] = temp1;
key_schedule[7] = temp3;
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x10 ) );
key_schedule[7] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[7] ), _mm_castsi128_pd( temp1 ), 0 ) );
key_schedule[8] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x20 ) );
key_schedule[9] = temp1;
key_schedule[10] = temp3;
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x40 ) );
key_schedule[10] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( key_schedule[10] ), _mm_castsi128_pd( temp1 ), 0 ) );
key_schedule[11] = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( temp1 ), _mm_castsi128_pd( temp3 ), 1 ) );
aes_key_192_assist( &temp1, &temp3, _mm_aeskeygenassist_si128( temp3, 0x80 ) );
key_schedule[12] = temp1;
asm( "movdqu (%1), %%xmm0 \n\t" // copy original round key
"movdqu %%xmm0, (%0) \n\t"
"add $16, %0 \n\t"
@ -373,6 +602,7 @@ static void aesni_setkey_enc_192( unsigned char *rk,
: "r" (rk), "r" (key)
: "memory", "cc", "0" );
@ -381,6 +611,40 @@ static void aesni_setkey_enc_192( unsigned char *rk,
static void aesni_setkey_enc_256( unsigned char *rk,
const unsigned char *key )
__m128i temp1, temp3;
__m128i *key_schedule = (__m128i*)rk;
temp1 = _mm_loadu_si128( (__m128i*)key );
temp3 = _mm_loadu_si128( (__m128i*)(key + 16) );
key_schedule[0] = temp1;
key_schedule[1] = temp3;
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x01 ) );
key_schedule[2] = temp1;
aes_key_256_assist_2( &temp1, &temp3 );
key_schedule[3] = temp3;
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x02 ) );
key_schedule[4] = temp1;
aes_key_256_assist_2( &temp1, &temp3 );
key_schedule[5] = temp3;
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x04 ) );
key_schedule[6] = temp1;
aes_key_256_assist_2( &temp1, &temp3 );
key_schedule[7] = temp3;
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x08 ) );
key_schedule[8] = temp1;
aes_key_256_assist_2( &temp1, &temp3 );
key_schedule[9] = temp3;
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x10 ) );
key_schedule[10] = temp1;
aes_key_256_assist_2( &temp1, &temp3 );
key_schedule[11] = temp3;
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x20 ) );
key_schedule[12] = temp1;
aes_key_256_assist_2( &temp1, &temp3 );
key_schedule[13] = temp3;
aes_key_256_assist_1( &temp1, _mm_aeskeygenassist_si128( temp3, 0x40 ) );
key_schedule[14] = temp1;
asm( "movdqu (%1), %%xmm0 \n\t"
"movdqu %%xmm0, (%0) \n\t"
"add $16, %0 \n\t"
@ -439,6 +703,7 @@ static void aesni_setkey_enc_256( unsigned char *rk,
: "r" (rk), "r" (key)
: "memory", "cc", "0" );
@ -112,6 +112,23 @@ unsigned long mbedtls_timing_hardclock( void )
__GNUC__ && ( __amd64__ || __x86_64__ ) */
#if !defined(HAVE_HARDCLOCK) && defined(_MSC_VER) && defined(_M_X64)

/* Mark the cycle counter as provided so later implementations are skipped. */
#define HAVE_HARDCLOCK

#include <intrin.h>

/*
 * Cycle counter for MSVC on x64, read with the __rdtsc() intrinsic.
 *
 * Returns the low-order bits of the 64-bit time-stamp counter that fit in
 * an unsigned long; the upper bits are discarded.
 */
unsigned long mbedtls_timing_hardclock( void )
{
    /*
     * A plain cast truncates to the low part.  This matches overlaying the
     * 64-bit counter with a { lo, hi } pair of unsigned longs and returning
     * lo, without relying on the in-memory layout.
     */
    return( (unsigned long) __rdtsc() );
}
#endif /* !HAVE_HARDCLOCK && _MSC_VER && _M_X64 */
#if !defined(HAVE_HARDCLOCK) && defined(MBEDTLS_HAVE_ASM) && \
defined(__GNUC__) && ( defined(__powerpc__) || defined(__ppc__) )
Reference in a new issue