// Crypto++
00001 // gcm.cpp - written and placed in the public domain by Wei Dai 00002 00003 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM gcm.cpp" to generate MASM code 00004 00005 #include "pch.h" 00006 00007 #ifndef CRYPTOPP_IMPORTS 00008 #ifndef CRYPTOPP_GENERATE_X64_MASM 00009 00010 #include "gcm.h" 00011 #include "cpu.h" 00012 00013 NAMESPACE_BEGIN(CryptoPP) 00014 00015 word16 GCM_Base::s_reductionTable[256]; 00016 volatile bool GCM_Base::s_reductionTableInitialized = false; 00017 00018 void GCM_Base::GCTR::IncrementCounterBy256() 00019 { 00020 IncrementCounterByOne(m_counterArray+BlockSize()-4, 3); 00021 } 00022 00023 #if 0 00024 // preserved for testing 00025 void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c) 00026 { 00027 word64 Z0=0, Z1=0, V0, V1; 00028 00029 typedef BlockGetAndPut<word64, BigEndian> Block; 00030 Block::Get(a)(V0)(V1); 00031 00032 for (int i=0; i<16; i++) 00033 { 00034 for (int j=0x80; j!=0; j>>=1) 00035 { 00036 int x = b[i] & j; 00037 Z0 ^= x ? V0 : 0; 00038 Z1 ^= x ? V1 : 0; 00039 x = (int)V1 & 1; 00040 V1 = (V1>>1) | (V0<<63); 00041 V0 = (V0>>1) ^ (x ? 
W64LIT(0xe1) << 56 : 0); 00042 } 00043 } 00044 Block::Put(NULL, c)(Z0)(Z1); 00045 } 00046 00047 __m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i) 00048 { 00049 word64 A[1] = {ByteReverse(((word64*)&a)[i&1])}; 00050 word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])}; 00051 00052 PolynomialMod2 pa((byte *)A, 8); 00053 PolynomialMod2 pb((byte *)B, 8); 00054 PolynomialMod2 c = pa*pb; 00055 00056 __m128i output; 00057 for (int i=0; i<16; i++) 00058 ((byte *)&output)[i] = c.GetByte(i); 00059 return output; 00060 } 00061 #endif 00062 00063 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 00064 inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c) 00065 { 00066 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 00067 *(__m128i *)a = _mm_xor_si128(*(__m128i *)b, *(__m128i *)c); 00068 #else 00069 asm ("movdqa %1, %%xmm0; pxor %2, %%xmm0; movdqa %%xmm0, %0;" : "=m" (a[0]) : "m"(b[0]), "m"(c[0])); 00070 #endif 00071 } 00072 #endif 00073 00074 inline static void Xor16(byte *a, const byte *b, const byte *c) 00075 { 00076 ((word64 *)a)[0] = ((word64 *)b)[0] ^ ((word64 *)c)[0]; 00077 ((word64 *)a)[1] = ((word64 *)b)[1] ^ ((word64 *)c)[1]; 00078 } 00079 00080 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 00081 static CRYPTOPP_ALIGN_DATA(16) const word64 s_clmulConstants64[] = { 00082 W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), 00083 W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), 00084 W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)}; 00085 static const __m128i *s_clmulConstants = (const __m128i *)s_clmulConstants64; 00086 static const unsigned int s_clmulTableSizeInBlocks = 8; 00087 00088 inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r) 00089 { 00090 /* 00091 The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. 
c0t below refers to the most 00092 significant half of c0 as a polynomial, which, due to GCM's bit reflection, are in the 00093 rightmost bit positions, and the lowest byte addresses. 00094 00095 c1 ^= c0t * 0xc200000000000000 00096 c2t ^= c0t 00097 t = shift (c1t ^ c0b) left 1 bit 00098 c2 ^= t * 0xe100000000000000 00099 c2t ^= c1b 00100 shift c2 left 1 bit and xor in lowest bit of c1t 00101 */ 00102 #if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301 00103 c2 = _mm_xor_si128(c2, _mm_move_epi64(c0)); 00104 #else 00105 c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8)); 00106 #endif 00107 c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10)); 00108 c0 = _mm_srli_si128(c0, 8); 00109 c0 = _mm_xor_si128(c0, c1); 00110 c0 = _mm_slli_epi64(c0, 1); 00111 c0 = _mm_clmulepi64_si128(c0, r, 0); 00112 c2 = _mm_xor_si128(c2, c0); 00113 c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8)); 00114 c1 = _mm_unpacklo_epi64(c1, c2); 00115 c1 = _mm_srli_epi64(c1, 63); 00116 c2 = _mm_slli_epi64(c2, 1); 00117 return _mm_xor_si128(c2, c1); 00118 } 00119 00120 inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r) 00121 { 00122 __m128i c0 = _mm_clmulepi64_si128(x,h,0); 00123 __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10)); 00124 __m128i c2 = _mm_clmulepi64_si128(x,h,0x11); 00125 00126 return CLMUL_Reduce(c0, c1, c2, r); 00127 } 00128 #endif 00129 00130 void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs ¶ms) 00131 { 00132 BlockCipher &blockCipher = AccessBlockCipher(); 00133 blockCipher.SetKey(userKey, keylength, params); 00134 00135 if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE) 00136 throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16"); 00137 00138 int tableSize, i, j, k; 00139 00140 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 00141 if (HasCLMUL()) 00142 { 00143 
params.GetIntValue(Name::TableSize(), tableSize); // avoid "parameter not used" error 00144 tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE; 00145 } 00146 else 00147 #endif 00148 { 00149 if (params.GetIntValue(Name::TableSize(), tableSize)) 00150 tableSize = (tableSize >= 64*1024) ? 64*1024 : 2*1024; 00151 else 00152 tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024; 00153 00154 #if defined(_MSC_VER) && (_MSC_VER >= 1300 && _MSC_VER < 1400) 00155 // VC 2003 workaround: compiler generates bad code for 64K tables 00156 tableSize = 2*1024; 00157 #endif 00158 } 00159 00160 m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize); 00161 byte *table = MulTable(); 00162 byte *hashKey = HashKey(); 00163 memset(hashKey, 0, REQUIRED_BLOCKSIZE); 00164 blockCipher.ProcessBlock(hashKey); 00165 00166 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 00167 if (HasCLMUL()) 00168 { 00169 const __m128i r = s_clmulConstants[0]; 00170 __m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)hashKey), s_clmulConstants[1]); 00171 __m128i h = h0; 00172 00173 for (i=0; i<tableSize; i+=32) 00174 { 00175 __m128i h1 = CLMUL_GF_Mul(h, h0, r); 00176 _mm_storel_epi64((__m128i *)(table+i), h); 00177 _mm_storeu_si128((__m128i *)(table+i+16), h1); 00178 _mm_storeu_si128((__m128i *)(table+i+8), h); 00179 _mm_storel_epi64((__m128i *)(table+i+8), h1); 00180 h = CLMUL_GF_Mul(h1, h0, r); 00181 } 00182 00183 return; 00184 } 00185 #endif 00186 00187 word64 V0, V1; 00188 typedef BlockGetAndPut<word64, BigEndian> Block; 00189 Block::Get(hashKey)(V0)(V1); 00190 00191 if (tableSize == 64*1024) 00192 { 00193 for (i=0; i<128; i++) 00194 { 00195 k = i%8; 00196 Block::Put(NULL, table+(i/8)*256*16+(size_t(1)<<(11-k)))(V0)(V1); 00197 00198 int x = (int)V1 & 1; 00199 V1 = (V1>>1) | (V0<<63); 00200 V0 = (V0>>1) ^ (x ? 
W64LIT(0xe1) << 56 : 0); 00201 } 00202 00203 for (i=0; i<16; i++) 00204 { 00205 memset(table+i*256*16, 0, 16); 00206 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 00207 if (HasSSE2()) 00208 for (j=2; j<=0x80; j*=2) 00209 for (k=1; k<j; k++) 00210 SSE2_Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16); 00211 else 00212 #endif 00213 for (j=2; j<=0x80; j*=2) 00214 for (k=1; k<j; k++) 00215 Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16); 00216 } 00217 } 00218 else 00219 { 00220 if (!s_reductionTableInitialized) 00221 { 00222 s_reductionTable[0] = 0; 00223 word16 x = 0x01c2; 00224 s_reductionTable[1] = ByteReverse(x); 00225 for (int i=2; i<=0x80; i*=2) 00226 { 00227 x <<= 1; 00228 s_reductionTable[i] = ByteReverse(x); 00229 for (int j=1; j<i; j++) 00230 s_reductionTable[i+j] = s_reductionTable[i] ^ s_reductionTable[j]; 00231 } 00232 s_reductionTableInitialized = true; 00233 } 00234 00235 for (i=0; i<128-24; i++) 00236 { 00237 k = i%32; 00238 if (k < 4) 00239 Block::Put(NULL, table+1024+(i/32)*256+(size_t(1)<<(7-k)))(V0)(V1); 00240 else if (k < 8) 00241 Block::Put(NULL, table+(i/32)*256+(size_t(1)<<(11-k)))(V0)(V1); 00242 00243 int x = (int)V1 & 1; 00244 V1 = (V1>>1) | (V0<<63); 00245 V0 = (V0>>1) ^ (x ? 
W64LIT(0xe1) << 56 : 0); 00246 } 00247 00248 for (i=0; i<4; i++) 00249 { 00250 memset(table+i*256, 0, 16); 00251 memset(table+1024+i*256, 0, 16); 00252 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 00253 if (HasSSE2()) 00254 for (j=2; j<=8; j*=2) 00255 for (k=1; k<j; k++) 00256 { 00257 SSE2_Xor16(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16); 00258 SSE2_Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16); 00259 } 00260 else 00261 #endif 00262 for (j=2; j<=8; j*=2) 00263 for (k=1; k<j; k++) 00264 { 00265 Xor16(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16); 00266 Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16); 00267 } 00268 } 00269 } 00270 } 00271 00272 inline void GCM_Base::ReverseHashBufferIfNeeded() 00273 { 00274 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 00275 if (HasCLMUL()) 00276 { 00277 __m128i &x = *(__m128i *)HashBuffer(); 00278 x = _mm_shuffle_epi8(x, s_clmulConstants[1]); 00279 } 00280 #endif 00281 } 00282 00283 void GCM_Base::Resync(const byte *iv, size_t len) 00284 { 00285 BlockCipher &cipher = AccessBlockCipher(); 00286 byte *hashBuffer = HashBuffer(); 00287 00288 if (len == 12) 00289 { 00290 memcpy(hashBuffer, iv, len); 00291 memset(hashBuffer+len, 0, 3); 00292 hashBuffer[len+3] = 1; 00293 } 00294 else 00295 { 00296 size_t origLen = len; 00297 memset(hashBuffer, 0, HASH_BLOCKSIZE); 00298 00299 if (len >= HASH_BLOCKSIZE) 00300 { 00301 len = GCM_Base::AuthenticateBlocks(iv, len); 00302 iv += (origLen - len); 00303 } 00304 00305 if (len > 0) 00306 { 00307 memcpy(m_buffer, iv, len); 00308 memset(m_buffer+len, 0, HASH_BLOCKSIZE-len); 00309 GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE); 00310 } 00311 00312 PutBlock<word64, BigEndian, true>(NULL, m_buffer)(0)(origLen*8); 00313 GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE); 00314 00315 ReverseHashBufferIfNeeded(); 00316 } 00317 00318 if (m_state >= State_IVSet) 00319 
m_ctr.Resynchronize(hashBuffer, REQUIRED_BLOCKSIZE); 00320 else 00321 m_ctr.SetCipherWithIV(cipher, hashBuffer); 00322 00323 m_ctr.Seek(HASH_BLOCKSIZE); 00324 00325 memset(hashBuffer, 0, HASH_BLOCKSIZE); 00326 } 00327 00328 unsigned int GCM_Base::OptimalDataAlignment() const 00329 { 00330 return 00331 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) 00332 HasSSE2() ? 16 : 00333 #endif 00334 GetBlockCipher().OptimalDataAlignment(); 00335 } 00336 00337 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code 00338 00339 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM 00340 00341 #ifdef CRYPTOPP_X64_MASM_AVAILABLE 00342 extern "C" { 00343 void GCM_AuthenticateBlocks_2K(const byte *data, size_t blocks, word64 *hashBuffer, const word16 *reductionTable); 00344 void GCM_AuthenticateBlocks_64K(const byte *data, size_t blocks, word64 *hashBuffer); 00345 } 00346 #endif 00347 00348 #ifndef CRYPTOPP_GENERATE_X64_MASM 00349 00350 size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len) 00351 { 00352 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 00353 if (HasCLMUL()) 00354 { 00355 const __m128i *table = (const __m128i *)MulTable(); 00356 __m128i x = _mm_load_si128((__m128i *)HashBuffer()); 00357 const __m128i r = s_clmulConstants[0], bswapMask = s_clmulConstants[1], bswapMask2 = s_clmulConstants[2]; 00358 00359 while (len >= 16) 00360 { 00361 size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0; 00362 __m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-1)*16)), bswapMask2);; 00363 __m128i c0 = _mm_setzero_si128(); 00364 __m128i c1 = _mm_setzero_si128(); 00365 __m128i c2 = _mm_setzero_si128(); 00366 00367 while (true) 00368 { 00369 __m128i h0 = _mm_load_si128(table+i); 00370 __m128i h1 = _mm_load_si128(table+i+1); 00371 __m128i h01 = _mm_xor_si128(h0, h1); 00372 00373 if (++i == s) 00374 { 00375 d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)data), bswapMask); 00376 d = 
_mm_xor_si128(d, x); 00377 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0)); 00378 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1)); 00379 d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2))); 00380 c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0)); 00381 break; 00382 } 00383 00384 d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-i)*16-8)), bswapMask2); 00385 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1)); 00386 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1)); 00387 d2 = _mm_xor_si128(d2, d); 00388 c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h01, 1)); 00389 00390 if (++i == s) 00391 { 00392 d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)data), bswapMask); 00393 d = _mm_xor_si128(d, x); 00394 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10)); 00395 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 0x11)); 00396 d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2))); 00397 c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10)); 00398 break; 00399 } 00400 00401 d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data+(s-i)*16-8)), bswapMask); 00402 c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10)); 00403 c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10)); 00404 d = _mm_xor_si128(d, d2); 00405 c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10)); 00406 } 00407 data += s*16; 00408 len -= s*16; 00409 00410 c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2); 00411 x = CLMUL_Reduce(c0, c1, c2, r); 00412 } 00413 00414 _mm_store_si128((__m128i *)HashBuffer(), x); 00415 return len; 00416 } 00417 #endif 00418 00419 typedef BlockGetAndPut<word64, NativeByteOrder> Block; 00420 word64 *hashBuffer = (word64 *)HashBuffer(); 00421 00422 switch (2*(m_buffer.size()>=64*1024) 00423 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) 00424 + HasSSE2() 00425 #endif 00426 ) 00427 { 00428 case 0: // non-SSE2 and 2K tables 00429 { 00430 byte 
*table = MulTable(); 00431 word64 x0 = hashBuffer[0], x1 = hashBuffer[1]; 00432 00433 do 00434 { 00435 word64 y0, y1, a0, a1, b0, b1, c0, c1, d0, d1; 00436 Block::Get(data)(y0)(y1); 00437 x0 ^= y0; 00438 x1 ^= y1; 00439 00440 data += HASH_BLOCKSIZE; 00441 len -= HASH_BLOCKSIZE; 00442 00443 #define READ_TABLE_WORD64_COMMON(a, b, c, d) *(word64 *)(table+(a*1024)+(b*256)+c+d*8) 00444 00445 #ifdef IS_LITTLE_ENDIAN 00446 #if CRYPTOPP_BOOL_SLOW_WORD64 00447 word32 z0 = (word32)x0; 00448 word32 z1 = (word32)(x0>>32); 00449 word32 z2 = (word32)x1; 00450 word32 z3 = (word32)(x1>>32); 00451 #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((d%2), c, (d?(z##c>>((d?d-1:0)*4))&0xf0:(z##c&0xf)<<4), e) 00452 #else 00453 #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((d%2), c, ((d+8*b)?(x##a>>(((d+8*b)?(d+8*b)-1:1)*4))&0xf0:(x##a&0xf)<<4), e) 00454 #endif 00455 #define GF_MOST_SIG_8BITS(a) (a##1 >> 7*8) 00456 #define GF_SHIFT_8(a) a##1 = (a##1 << 8) ^ (a##0 >> 7*8); a##0 <<= 8; 00457 #else 00458 #define READ_TABLE_WORD64(a, b, c, d, e) READ_TABLE_WORD64_COMMON((1-d%2), c, ((15-d-8*b)?(x##a>>(((15-d-8*b)?(15-d-8*b)-1:0)*4))&0xf0:(x##a&0xf)<<4), e) 00459 #define GF_MOST_SIG_8BITS(a) (a##1 & 0xff) 00460 #define GF_SHIFT_8(a) a##1 = (a##1 >> 8) ^ (a##0 << 7*8); a##0 >>= 8; 00461 #endif 00462 00463 #define GF_MUL_32BY128(op, a, b, c) \ 00464 a0 op READ_TABLE_WORD64(a, b, c, 0, 0) ^ READ_TABLE_WORD64(a, b, c, 1, 0);\ 00465 a1 op READ_TABLE_WORD64(a, b, c, 0, 1) ^ READ_TABLE_WORD64(a, b, c, 1, 1);\ 00466 b0 op READ_TABLE_WORD64(a, b, c, 2, 0) ^ READ_TABLE_WORD64(a, b, c, 3, 0);\ 00467 b1 op READ_TABLE_WORD64(a, b, c, 2, 1) ^ READ_TABLE_WORD64(a, b, c, 3, 1);\ 00468 c0 op READ_TABLE_WORD64(a, b, c, 4, 0) ^ READ_TABLE_WORD64(a, b, c, 5, 0);\ 00469 c1 op READ_TABLE_WORD64(a, b, c, 4, 1) ^ READ_TABLE_WORD64(a, b, c, 5, 1);\ 00470 d0 op READ_TABLE_WORD64(a, b, c, 6, 0) ^ READ_TABLE_WORD64(a, b, c, 7, 0);\ 00471 d1 op READ_TABLE_WORD64(a, b, c, 6, 1) ^ 
READ_TABLE_WORD64(a, b, c, 7, 1);\ 00472 00473 GF_MUL_32BY128(=, 0, 0, 0) 00474 GF_MUL_32BY128(^=, 0, 1, 1) 00475 GF_MUL_32BY128(^=, 1, 0, 2) 00476 GF_MUL_32BY128(^=, 1, 1, 3) 00477 00478 word32 r = (word32)s_reductionTable[GF_MOST_SIG_8BITS(d)] << 16; 00479 GF_SHIFT_8(d) 00480 c0 ^= d0; c1 ^= d1; 00481 r ^= (word32)s_reductionTable[GF_MOST_SIG_8BITS(c)] << 8; 00482 GF_SHIFT_8(c) 00483 b0 ^= c0; b1 ^= c1; 00484 r ^= s_reductionTable[GF_MOST_SIG_8BITS(b)]; 00485 GF_SHIFT_8(b) 00486 a0 ^= b0; a1 ^= b1; 00487 a0 ^= ConditionalByteReverse<word64>(LITTLE_ENDIAN_ORDER, r); 00488 x0 = a0; x1 = a1; 00489 } 00490 while (len >= HASH_BLOCKSIZE); 00491 00492 hashBuffer[0] = x0; hashBuffer[1] = x1; 00493 return len; 00494 } 00495 00496 case 2: // non-SSE2 and 64K tables 00497 { 00498 byte *table = MulTable(); 00499 word64 x0 = hashBuffer[0], x1 = hashBuffer[1]; 00500 00501 do 00502 { 00503 word64 y0, y1, a0, a1; 00504 Block::Get(data)(y0)(y1); 00505 x0 ^= y0; 00506 x1 ^= y1; 00507 00508 data += HASH_BLOCKSIZE; 00509 len -= HASH_BLOCKSIZE; 00510 00511 #undef READ_TABLE_WORD64_COMMON 00512 #undef READ_TABLE_WORD64 00513 00514 #define READ_TABLE_WORD64_COMMON(a, c, d) *(word64 *)(table+(a)*256*16+(c)+(d)*8) 00515 00516 #ifdef IS_LITTLE_ENDIAN 00517 #if CRYPTOPP_BOOL_SLOW_WORD64 00518 word32 z0 = (word32)x0; 00519 word32 z1 = (word32)(x0>>32); 00520 word32 z2 = (word32)x1; 00521 word32 z3 = (word32)(x1>>32); 00522 #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, (d?(z##c>>((d?d:1)*8-4))&0xff0:(z##c&0xff)<<4), e) 00523 #else 00524 #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, ((d+4*(c%2))?(x##b>>(((d+4*(c%2))?(d+4*(c%2)):1)*8-4))&0xff0:(x##b&0xff)<<4), e) 00525 #endif 00526 #else 00527 #define READ_TABLE_WORD64(b, c, d, e) READ_TABLE_WORD64_COMMON(c*4+d, ((7-d-4*(c%2))?(x##b>>(((7-d-4*(c%2))?(7-d-4*(c%2)):1)*8-4))&0xff0:(x##b&0xff)<<4), e) 00528 #endif 00529 00530 #define GF_MUL_8BY128(op, b, c, d) \ 00531 a0 op READ_TABLE_WORD64(b, c, d, 
0);\ 00532 a1 op READ_TABLE_WORD64(b, c, d, 1);\ 00533 00534 GF_MUL_8BY128(=, 0, 0, 0) 00535 GF_MUL_8BY128(^=, 0, 0, 1) 00536 GF_MUL_8BY128(^=, 0, 0, 2) 00537 GF_MUL_8BY128(^=, 0, 0, 3) 00538 GF_MUL_8BY128(^=, 0, 1, 0) 00539 GF_MUL_8BY128(^=, 0, 1, 1) 00540 GF_MUL_8BY128(^=, 0, 1, 2) 00541 GF_MUL_8BY128(^=, 0, 1, 3) 00542 GF_MUL_8BY128(^=, 1, 2, 0) 00543 GF_MUL_8BY128(^=, 1, 2, 1) 00544 GF_MUL_8BY128(^=, 1, 2, 2) 00545 GF_MUL_8BY128(^=, 1, 2, 3) 00546 GF_MUL_8BY128(^=, 1, 3, 0) 00547 GF_MUL_8BY128(^=, 1, 3, 1) 00548 GF_MUL_8BY128(^=, 1, 3, 2) 00549 GF_MUL_8BY128(^=, 1, 3, 3) 00550 00551 x0 = a0; x1 = a1; 00552 } 00553 while (len >= HASH_BLOCKSIZE); 00554 00555 hashBuffer[0] = x0; hashBuffer[1] = x1; 00556 return len; 00557 } 00558 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM 00559 00560 #ifdef CRYPTOPP_X64_MASM_AVAILABLE 00561 case 1: // SSE2 and 2K tables 00562 GCM_AuthenticateBlocks_2K(data, len/16, hashBuffer, s_reductionTable); 00563 return len % 16; 00564 case 3: // SSE2 and 64K tables 00565 GCM_AuthenticateBlocks_64K(data, len/16, hashBuffer); 00566 return len % 16; 00567 #endif 00568 00569 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 00570 case 1: // SSE2 and 2K tables 00571 { 00572 #ifdef __GNUC__ 00573 __asm__ __volatile__ 00574 ( 00575 ".intel_syntax noprefix;" 00576 #elif defined(CRYPTOPP_GENERATE_X64_MASM) 00577 ALIGN 8 00578 GCM_AuthenticateBlocks_2K PROC FRAME 00579 rex_push_reg rsi 00580 push_reg rdi 00581 push_reg rbx 00582 .endprolog 00583 mov rsi, r8 00584 mov r11, r9 00585 #else 00586 AS2( mov WORD_REG(cx), data ) 00587 AS2( mov WORD_REG(dx), len ) 00588 AS2( mov WORD_REG(si), hashBuffer ) 00589 AS2( shr WORD_REG(dx), 4 ) 00590 #endif 00591 00592 AS_PUSH_IF86( bx) 00593 AS_PUSH_IF86( bp) 00594 00595 #ifdef __GNUC__ 00596 AS2( mov AS_REG_7, WORD_REG(di)) 00597 #elif CRYPTOPP_BOOL_X86 00598 AS2( lea AS_REG_7, s_reductionTable) 00599 #endif 00600 00601 AS2( movdqa xmm0, [WORD_REG(si)] ) 00602 00603 #define MUL_TABLE_0 WORD_REG(si) + 32 00604 #define 
MUL_TABLE_1 WORD_REG(si) + 32 + 1024 00605 #define RED_TABLE AS_REG_7 00606 00607 ASL(0) 00608 AS2( movdqu xmm4, [WORD_REG(cx)] ) 00609 AS2( pxor xmm0, xmm4 ) 00610 00611 AS2( movd ebx, xmm0 ) 00612 AS2( mov eax, AS_HEX(f0f0f0f0) ) 00613 AS2( and eax, ebx ) 00614 AS2( shl ebx, 4 ) 00615 AS2( and ebx, AS_HEX(f0f0f0f0) ) 00616 AS2( movzx edi, ah ) 00617 AS2( movdqa xmm5, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] ) 00618 AS2( movzx edi, al ) 00619 AS2( movdqa xmm4, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] ) 00620 AS2( shr eax, 16 ) 00621 AS2( movzx edi, ah ) 00622 AS2( movdqa xmm3, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] ) 00623 AS2( movzx edi, al ) 00624 AS2( movdqa xmm2, XMMWORD_PTR [MUL_TABLE_1 + WORD_REG(di)] ) 00625 00626 #define SSE2_MUL_32BITS(i) \ 00627 AS2( psrldq xmm0, 4 )\ 00628 AS2( movd eax, xmm0 )\ 00629 AS2( and eax, AS_HEX(f0f0f0f0) )\ 00630 AS2( movzx edi, bh )\ 00631 AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\ 00632 AS2( movzx edi, bl )\ 00633 AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\ 00634 AS2( shr ebx, 16 )\ 00635 AS2( movzx edi, bh )\ 00636 AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\ 00637 AS2( movzx edi, bl )\ 00638 AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_0 + (i-1)*256 + WORD_REG(di)] )\ 00639 AS2( movd ebx, xmm0 )\ 00640 AS2( shl ebx, 4 )\ 00641 AS2( and ebx, AS_HEX(f0f0f0f0) )\ 00642 AS2( movzx edi, ah )\ 00643 AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\ 00644 AS2( movzx edi, al )\ 00645 AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\ 00646 AS2( shr eax, 16 )\ 00647 AS2( movzx edi, ah )\ 00648 AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\ 00649 AS2( movzx edi, al )\ 00650 AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_1 + i*256 + WORD_REG(di)] )\ 00651 00652 SSE2_MUL_32BITS(1) 00653 SSE2_MUL_32BITS(2) 00654 SSE2_MUL_32BITS(3) 00655 00656 AS2( movzx edi, bh ) 00657 AS2( pxor xmm5, XMMWORD_PTR [MUL_TABLE_0 
+ 3*256 + WORD_REG(di)] ) 00658 AS2( movzx edi, bl ) 00659 AS2( pxor xmm4, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] ) 00660 AS2( shr ebx, 16 ) 00661 AS2( movzx edi, bh ) 00662 AS2( pxor xmm3, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] ) 00663 AS2( movzx edi, bl ) 00664 AS2( pxor xmm2, XMMWORD_PTR [MUL_TABLE_0 + 3*256 + WORD_REG(di)] ) 00665 00666 AS2( movdqa xmm0, xmm3 ) 00667 AS2( pslldq xmm3, 1 ) 00668 AS2( pxor xmm2, xmm3 ) 00669 AS2( movdqa xmm1, xmm2 ) 00670 AS2( pslldq xmm2, 1 ) 00671 AS2( pxor xmm5, xmm2 ) 00672 00673 AS2( psrldq xmm0, 15 ) 00674 AS2( movd WORD_REG(di), xmm0 ) 00675 AS2( movzx eax, WORD PTR [RED_TABLE + WORD_REG(di)*2] ) 00676 AS2( shl eax, 8 ) 00677 00678 AS2( movdqa xmm0, xmm5 ) 00679 AS2( pslldq xmm5, 1 ) 00680 AS2( pxor xmm4, xmm5 ) 00681 00682 AS2( psrldq xmm1, 15 ) 00683 AS2( movd WORD_REG(di), xmm1 ) 00684 AS2( xor ax, WORD PTR [RED_TABLE + WORD_REG(di)*2] ) 00685 AS2( shl eax, 8 ) 00686 00687 AS2( psrldq xmm0, 15 ) 00688 AS2( movd WORD_REG(di), xmm0 ) 00689 AS2( xor ax, WORD PTR [RED_TABLE + WORD_REG(di)*2] ) 00690 00691 AS2( movd xmm0, eax ) 00692 AS2( pxor xmm0, xmm4 ) 00693 00694 AS2( add WORD_REG(cx), 16 ) 00695 AS2( sub WORD_REG(dx), 1 ) 00696 ASJ( jnz, 0, b ) 00697 AS2( movdqa [WORD_REG(si)], xmm0 ) 00698 00699 AS_POP_IF86( bp) 00700 AS_POP_IF86( bx) 00701 00702 #ifdef __GNUC__ 00703 ".att_syntax prefix;" 00704 : 00705 : "c" (data), "d" (len/16), "S" (hashBuffer), "D" (s_reductionTable) 00706 : "memory", "cc", "%eax" 00707 #if CRYPTOPP_BOOL_X64 00708 , "%ebx", "%r11" 00709 #endif 00710 ); 00711 #elif defined(CRYPTOPP_GENERATE_X64_MASM) 00712 pop rbx 00713 pop rdi 00714 pop rsi 00715 ret 00716 GCM_AuthenticateBlocks_2K ENDP 00717 #endif 00718 00719 return len%16; 00720 } 00721 case 3: // SSE2 and 64K tables 00722 { 00723 #ifdef __GNUC__ 00724 __asm__ __volatile__ 00725 ( 00726 ".intel_syntax noprefix;" 00727 #elif defined(CRYPTOPP_GENERATE_X64_MASM) 00728 ALIGN 8 00729 GCM_AuthenticateBlocks_64K PROC FRAME 00730 
rex_push_reg rsi 00731 push_reg rdi 00732 .endprolog 00733 mov rsi, r8 00734 #else 00735 AS2( mov WORD_REG(cx), data ) 00736 AS2( mov WORD_REG(dx), len ) 00737 AS2( mov WORD_REG(si), hashBuffer ) 00738 AS2( shr WORD_REG(dx), 4 ) 00739 #endif 00740 00741 AS2( movdqa xmm0, [WORD_REG(si)] ) 00742 00743 #undef MUL_TABLE 00744 #define MUL_TABLE(i,j) WORD_REG(si) + 32 + (i*4+j)*256*16 00745 00746 ASL(1) 00747 AS2( movdqu xmm1, [WORD_REG(cx)] ) 00748 AS2( pxor xmm1, xmm0 ) 00749 AS2( pxor xmm0, xmm0 ) 00750 00751 #undef SSE2_MUL_32BITS 00752 #define SSE2_MUL_32BITS(i) \ 00753 AS2( movd eax, xmm1 )\ 00754 AS2( psrldq xmm1, 4 )\ 00755 AS2( movzx edi, al )\ 00756 AS2( add WORD_REG(di), WORD_REG(di) )\ 00757 AS2( pxor xmm0, [MUL_TABLE(i,0) + WORD_REG(di)*8] )\ 00758 AS2( movzx edi, ah )\ 00759 AS2( add WORD_REG(di), WORD_REG(di) )\ 00760 AS2( pxor xmm0, [MUL_TABLE(i,1) + WORD_REG(di)*8] )\ 00761 AS2( shr eax, 16 )\ 00762 AS2( movzx edi, al )\ 00763 AS2( add WORD_REG(di), WORD_REG(di) )\ 00764 AS2( pxor xmm0, [MUL_TABLE(i,2) + WORD_REG(di)*8] )\ 00765 AS2( movzx edi, ah )\ 00766 AS2( add WORD_REG(di), WORD_REG(di) )\ 00767 AS2( pxor xmm0, [MUL_TABLE(i,3) + WORD_REG(di)*8] )\ 00768 00769 SSE2_MUL_32BITS(0) 00770 SSE2_MUL_32BITS(1) 00771 SSE2_MUL_32BITS(2) 00772 SSE2_MUL_32BITS(3) 00773 00774 AS2( add WORD_REG(cx), 16 ) 00775 AS2( sub WORD_REG(dx), 1 ) 00776 ASJ( jnz, 1, b ) 00777 AS2( movdqa [WORD_REG(si)], xmm0 ) 00778 00779 #ifdef __GNUC__ 00780 ".att_syntax prefix;" 00781 : 00782 : "c" (data), "d" (len/16), "S" (hashBuffer) 00783 : "memory", "cc", "%edi", "%eax" 00784 ); 00785 #elif defined(CRYPTOPP_GENERATE_X64_MASM) 00786 pop rdi 00787 pop rsi 00788 ret 00789 GCM_AuthenticateBlocks_64K ENDP 00790 #endif 00791 00792 return len%16; 00793 } 00794 #endif 00795 #ifndef CRYPTOPP_GENERATE_X64_MASM 00796 } 00797 00798 return len%16; 00799 } 00800 00801 void GCM_Base::AuthenticateLastHeaderBlock() 00802 { 00803 if (m_bufferedDataLength > 0) 00804 { 00805 
memset(m_buffer+m_bufferedDataLength, 0, HASH_BLOCKSIZE-m_bufferedDataLength); 00806 m_bufferedDataLength = 0; 00807 GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE); 00808 } 00809 } 00810 00811 void GCM_Base::AuthenticateLastConfidentialBlock() 00812 { 00813 GCM_Base::AuthenticateLastHeaderBlock(); 00814 PutBlock<word64, BigEndian, true>(NULL, m_buffer)(m_totalHeaderLength*8)(m_totalMessageLength*8); 00815 GCM_Base::AuthenticateBlocks(m_buffer, HASH_BLOCKSIZE); 00816 } 00817 00818 void GCM_Base::AuthenticateLastFooterBlock(byte *mac, size_t macSize) 00819 { 00820 m_ctr.Seek(0); 00821 ReverseHashBufferIfNeeded(); 00822 m_ctr.ProcessData(mac, HashBuffer(), macSize); 00823 } 00824 00825 NAMESPACE_END 00826 00827 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM 00828 #endif