/* AesOpt.c -- Intel's AES
2017-06-08 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "CpuArch.h"

/* The AES-NI code path is enabled only for x86/x64 builds with a new enough
   Microsoft compiler; every other build gets the forwarding wrappers at the
   bottom of this file. */
#ifdef MY_CPU_X86_OR_AMD64
  #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
    #define USE_INTEL_AES
  #endif
#endif

#ifdef USE_INTEL_AES

#include <wmmintrin.h>

/*
  Layout of the state block `p`, as it is used by the functions below:

    p[0]   chaining value (CBC) or counter (CTR); loaded on entry and
           stored back on exit.
    p[1]   its first 32 bits hold numRounds2; every step of the round loops
           consumes two round keys.
    p[2..] round keys. The encrypt paths read p[2] upward to
           p[2 * numRounds2 + 2]; the decrypt path reads the same range
           downward.

  The do/while round loops decrement before testing, so they require
  numRounds2 >= 2.

  NOTE(review): `p` and `data` are dereferenced directly as __m128i, so they
  are presumably required to be 16-byte aligned - confirm against the callers.
*/

/*
  CBC encryption of `numBlocks` 16-byte blocks, in place.

  p         state block (see layout above); p[0] is updated to the last
            ciphertext block produced.
  data      blocks to encrypt; overwritten with the ciphertext.
  numBlocks number of 16-byte blocks; 0 leaves `data` untouched.

  Each block depends on the previous ciphertext, so blocks are processed
  strictly one at a time.
*/
void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
{
  __m128i m = *p;
  for (; numBlocks != 0; numBlocks--, data++)
  {
    UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
    const __m128i *w = p + 3;
    /* chain with the previous ciphertext (or the initial value), then apply
       the first round key p[2] */
    m = _mm_xor_si128(m, *data);
    m = _mm_xor_si128(m, p[2]);
    do
    {
      m = _mm_aesenc_si128(m, w[0]);
      m = _mm_aesenc_si128(m, w[1]);
      w += 2;
    }
    while (--numRounds2 != 0);
    m = _mm_aesenc_si128(m, w[0]);
    m = _mm_aesenclast_si128(m, w[1]);
    *data = m;
  }
  *p = m;
}

/* Number of independent blocks processed per iteration in the decode and
   CTR main loops. AES_OP_W below is written for exactly this many lanes. */
#define NUM_WAYS 3

/* Applies the intrinsic `op` with round key w[n] to the three lanes
   m0, m1, m2. Expects `w`, `m0`, `m1` and `m2` to be in scope at the point
   of use. The expansion is a braced block, so it is used without a
   trailing semicolon. */
#define AES_OP_W(op, n) { \
    const __m128i t = w[n]; \
    m0 = op(m0, t); \
    m1 = op(m1, t); \
    m2 = op(m2, t); \
    }

#define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n)
#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
#define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n)
#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)

/*
  CBC decryption of `numBlocks` 16-byte blocks, in place.

  p         state block (see layout above); p[0] is updated to the last
            ciphertext block consumed.
  data      blocks to decrypt; overwritten with the plaintext.
  numBlocks number of 16-byte blocks.

  The main loop handles NUM_WAYS blocks per iteration; the second loop
  handles the remaining (numBlocks % NUM_WAYS) blocks one at a time.
*/
void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
{
  __m128i iv = *p;
  for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
  {
    UInt32 numRounds2 = *(const UInt32 *)(p + 1);
    /* start at the top of the key schedule and walk down towards p[2] */
    const __m128i *w = p + numRounds2 * 2;
    __m128i m0, m1, m2;
    {
      const __m128i t = w[2];
      m0 = _mm_xor_si128(t, data[0]);
      m1 = _mm_xor_si128(t, data[1]);
      m2 = _mm_xor_si128(t, data[2]);
    }
    numRounds2--;
    do
    {
      AES_DEC(1)
      AES_DEC(0)
      w -= 2;
    }
    while (--numRounds2 != 0);
    AES_DEC(1)
    AES_DEC_LAST(0)

    {
      /* Undo the chaining. Each ciphertext block is saved into `iv` before
         its slot is overwritten, because it is the chaining value for the
         next block. */
      __m128i t;
      t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;
      t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;
      t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;
    }
  }
  for (; numBlocks != 0; numBlocks--, data++)
  {
    UInt32 numRounds2 = *(const UInt32 *)(p + 1);
    const __m128i *w = p + numRounds2 * 2;
    __m128i m = _mm_xor_si128(w[2], *data);
    numRounds2--;
    do
    {
      m = _mm_aesdec_si128(m, w[1]);
      m = _mm_aesdec_si128(m, w[0]);
      w -= 2;
    }
    while (--numRounds2 != 0);
    m = _mm_aesdec_si128(m, w[1]);
    m = _mm_aesdeclast_si128(m, w[0]);

    m = _mm_xor_si128(m, iv);
    iv = *data;
    *data = m;
  }
  *p = iv;
}

/*
  CTR-mode transform of `numBlocks` 16-byte blocks, in place: each block is
  XORed with the encryption of the current counter value.

  p         state block (see layout above); p[0] is the counter and is
            updated to the last counter value used.
  data      blocks to transform; overwritten with the result.
  numBlocks number of 16-byte blocks.

  The counter is incremented BEFORE each block is generated, so the first
  block uses (p[0] + 1). The increment is a 64-bit lane add of (1, 0): only
  the low 64-bit lane changes and no carry propagates into the high lane.

  The main loop handles NUM_WAYS blocks per iteration; the second loop
  handles the remaining blocks one at a time.
*/
void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)
{
  __m128i ctr = *p;
  /* Low 64-bit lane = 1, high lane = 0. Built with an intrinsic instead of
     writing the compiler-specific m128i_u64 members of __m128i. */
  const __m128i one = _mm_cvtsi32_si128(1);
  for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
  {
    UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
    const __m128i *w = p;
    __m128i m0, m1, m2;
    {
      /* three consecutive counter values, each combined with the first
         round key p[2] */
      const __m128i t = w[2];
      ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);
      ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);
      ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);
    }
    w += 3;
    do
    {
      AES_ENC(0)
      AES_ENC(1)
      w += 2;
    }
    while (--numRounds2 != 0);
    AES_ENC(0)
    AES_ENC_LAST(1)
    data[0] = _mm_xor_si128(data[0], m0);
    data[1] = _mm_xor_si128(data[1], m1);
    data[2] = _mm_xor_si128(data[2], m2);
  }
  for (; numBlocks != 0; numBlocks--, data++)
  {
    UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    ctr = _mm_add_epi64(ctr, one);
    m = _mm_xor_si128(ctr, p[2]);
    w += 3;
    do
    {
      m = _mm_aesenc_si128(m, w[0]);
      m = _mm_aesenc_si128(m, w[1]);
      w += 2;
    }
    while (--numRounds2 != 0);
    m = _mm_aesenc_si128(m, w[0]);
    m = _mm_aesenclast_si128(m, w[1]);
    *data = _mm_xor_si128(*data, m);
  }
  *p = ctr;
}

#else

/* USE_INTEL_AES is not defined for this build: the *_Intel entry points are
   still provided, but simply forward to the functions declared below, which
   are defined outside this file. */

void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);
void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);
void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);

/* Forwards to AesCbc_Encode with the same arguments. */
void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
{
  AesCbc_Encode(p, data, numBlocks);
}

/* Forwards to AesCbc_Decode with the same arguments. */
void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
{
  AesCbc_Decode(p, data, numBlocks);
}

/* Forwards to AesCtr_Code with the same arguments. */
void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)
{
  AesCtr_Code(p, data, numBlocks);
}

#endif