Crypto++  5.6.3
Free C++ class library of cryptographic schemes
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2010: Added support for AES-NI instructions via compiler intrinsics.
9 */
10 
11 /*
12 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
13 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
14 and Peter Schwabe in their paper "New AES software speed records". The round
15 function was also modified to include a trick similar to one in Brian Gladman's
16 x86 assembly code, doing an 8-bit register move to minimize the number of
17 register spills. Also switched to compressed tables and copying round keys to
18 the stack.
19 
20 The C++ implementation now uses compressed tables if
21 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
22 */
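// [Editorial sketch, not part of the original sources] Counter mode caching in
// brief: successive CTR-mode counter blocks differ only in their low-order
// byte(s), so the first-round table lookups driven by the bytes that do not
// change can be computed once and reused across many blocks. The assembly
// below keeps those reusable partial-round results in L_SAVED_X and, per
// block, only redoes the work that depends on the incrementing counter byte.
// Roughly, with hypothetical helpers and assuming the last counter byte does
// not wrap within the batch:
//
//   word32 savedX[4];                                  // cached partial rounds
//   PrecomputeStaticCounterPart(counter, rk, savedX);  // done once per batch
//   for (size_t i = 0; i < blocks; i++, counter[15]++)
//       EncryptWithCachedFirstRound(counter[15], savedX, rk, out + 16*i);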
23 
24 /*
25 July 2006: Defense against timing attacks was added by Wei Dai.
26 
27 The code now uses smaller tables in the first and last rounds,
28 and preloads them into L1 cache before usage (by loading at least
29 one element in each cache line).
30 
31 We try to delay subsequent accesses to each table (used in the first
32 and last rounds) until all of the table has been preloaded. Hopefully
33 the compiler isn't smart enough to optimize that code away.
34 
35 After preloading the table, we also try not to access any memory location
36 other than the table and the stack, in order to prevent table entries from
37 being unloaded from L1 cache, until that round is finished.
38 (Some popular CPUs have 2-way associative caches.)
39 */
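// [Editorial note] The preload described above appears in
// Enc/Dec::ProcessAndXorBlock below. Condensed, the pattern is: read one word
// from every cache line of the table, fold the (always zero) volatile-seeded
// value into the state so the compiler cannot drop the reads, then avoid other
// memory traffic until the round is finished:
//
//   volatile word32 _u = 0;
//   word32 u = _u;
//   for (unsigned int i = 0; i < tableSizeInBytes; i += cacheLineSize)
//       u &= *(const word32 *)((const byte *)Te + i);  // one load per line
//   u &= Te[255];
//   s0 |= u; s1 |= u; s2 |= u; s3 |= u;                // u == 0, state unchanged
//
// (tableSizeInBytes is 2048 for the compressed layout and 1024 otherwise.)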
40 
41 // This is the original introductory comment:
42 
43 /**
44  * version 3.0 (December 2000)
45  *
46  * Optimised ANSI C code for the Rijndael cipher (now AES)
47  *
48  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
49  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
50  * author Paulo Barreto <paulo.barreto@terra.com.br>
51  *
52  * This code is hereby placed in the public domain.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
55  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
58  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
61  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
63  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
64  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  */
66 
67 #include "pch.h"
68 #include "config.h"
69 
70 #ifndef CRYPTOPP_IMPORTS
71 #ifndef CRYPTOPP_GENERATE_X64_MASM
72 
73 #include "rijndael.h"
74 #include "stdcpp.h" // alloca
75 #include "misc.h"
76 #include "cpu.h"
77 
78 NAMESPACE_BEGIN(CryptoPP)
79 
80 // Hack for https://github.com/weidai11/cryptopp/issues/42
81 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
82 # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1
83 #endif
84 
85 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
86 # if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
87 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
88 using namespace rdtable;
89 # else
90 static word64 Te[256];
91 # endif
92 static word64 Td[256];
93 #else
94 static word32 Te[256*4], Td[256*4];
95 #endif
96 static volatile bool s_TeFilled = false, s_TdFilled = false;
97 
98 // ************************* Portable Code ************************************
99 
100 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
101  a ^= L(T, 3, byte(t)); t >>= 8;\
102  b ^= L(T, 2, byte(t)); t >>= 8;\
103  c ^= L(T, 1, byte(t)); t >>= 8;\
104  d ^= L(T, 0, t);
105 
106 #define QUARTER_ROUND_LE(t, a, b, c, d) \
107  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
108  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
109  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
110  tempBlock[d] = ((byte *)(Te+t))[1];
111 
112 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
113  #define QUARTER_ROUND_LD(t, a, b, c, d) \
114  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
115  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
116  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
117  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
118 #else
119  #define QUARTER_ROUND_LD(t, a, b, c, d) \
120  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
121  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
122  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
123  tempBlock[d] = Sd[t];
124 #endif
125 
126 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
127 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
128 
129 #ifdef IS_LITTLE_ENDIAN
130  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
131  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
132  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
133  #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
134  #define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
135  #else
136  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
137  #define TL_M(T, i, x) T[i*256 + x]
138  #endif
139 #else
140  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
141  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
142  #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
143  #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
144  #define TL_M TL_F
145  #else
146  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
147  #define TL_M(T, i, x) T[i*256 + x]
148  #endif
149 #endif
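// [Editorial note] TL_F and TL_M above select a byte-rotation of a table
// entry. In the compressed layout (64-bit entries, unaligned access allowed)
// the rotation is an unaligned 32-bit read at a small byte offset inside the
// 8-byte entry; in the uncompressed layout it is either an explicit rotrFixed
// of T[x] (first and last rounds) or a lookup into one of four pre-rotated
// 256-entry tables, T[i*256 + x] (inner rounds).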
150 
151 
152 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
153 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
154 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
155 
156 #define f3(x) (f2(x) ^ x)
157 #define f9(x) (f8(x) ^ x)
158 #define fb(x) (f8(x) ^ f2(x) ^ x)
159 #define fd(x) (f8(x) ^ f4(x) ^ x)
160 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
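// [Editorial note] The macros above multiply a byte by a small constant in
// GF(2^8) with the AES reduction polynomial x^8+x^4+x^3+x+1 (0x11b). Worked
// example: f2(0x80) = (0x80<<1) ^ 0x11b = 0x100 ^ 0x11b = 0x1b, and
// f3(0x80) = f2(0x80) ^ 0x80 = 0x9b. f9, fb, fd and fe give the InvMixColumns
// coefficients 0x09, 0x0b, 0x0d and 0x0e used by FillDecTable below.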
161 
162 void Rijndael::Base::FillEncTable()
163 {
164  for (int i=0; i<256; i++)
165  {
166  byte x = Se[i];
167 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
168  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
169  Te[i] = word64(y | f3(x))<<32 | y;
170 #else
171  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
172  for (int j=0; j<4; j++)
173  {
174  Te[i+j*256] = y;
175  y = rotrFixed(y, 8);
176  }
177 #endif
178  }
179 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
180  Te[256] = Te[257] = 0;
181 #endif
182  s_TeFilled = true;
183 }
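// [Editorial note] Each Te entry combines the S-box output with the
// MixColumns coefficients {02,01,01,03}, so a full encryption round can be
// written, per 32-bit output word, as roughly
//
//   out = Te0[b0] ^ Te1[b1] ^ Te2[b2] ^ Te3[b3] ^ roundKey
//
// where b0..b3 are the input bytes feeding that column and Te0..Te3 are the
// four rotations of the same table. QUARTER_ROUND_E performs the same
// computation organized by source word: each input word scatters its four
// lookups across the four output words.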
184 
185 void Rijndael::Base::FillDecTable()
186 {
187  for (int i=0; i<256; i++)
188  {
189  byte x = Sd[i];
190 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
191  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
192  Td[i] = word64(y | fb(x))<<32 | y | x;
193 #else
194  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
195  for (int j=0; j<4; j++)
196  {
197  Td[i+j*256] = y;
198  y = rotrFixed(y, 8);
199  }
200 #endif
201  }
202  s_TdFilled = true;
203 }
204 
205 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
206 {
207  AssertValidKeyLength(keylen);
208 
209  m_rounds = keylen/4 + 6;
210  m_key.New(4*(m_rounds+1));
211 
212  word32 *rk = m_key;
213 
214 #if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32))
215  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
216  if (HasAESNI())
217  {
218  static const word32 rcLE[] = {
219  0x01, 0x02, 0x04, 0x08,
220  0x10, 0x20, 0x40, 0x80,
221  0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
222  };
223  const word32 *rc = rcLE;
224 
225  __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
226  memcpy(rk, userKey, keylen);
227 
228  while (true)
229  {
230  rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
231  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
232  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
233  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
234 
235  if (rk + keylen/4 + 4 == m_key.end())
236  break;
237 
238  if (keylen == 24)
239  {
240  rk[10] = rk[ 4] ^ rk[ 9];
241  rk[11] = rk[ 5] ^ rk[10];
242  temp = _mm_insert_epi32(temp, rk[11], 3);
243  }
244  else if (keylen == 32)
245  {
246  temp = _mm_insert_epi32(temp, rk[11], 3);
247  rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
248  rk[13] = rk[ 5] ^ rk[12];
249  rk[14] = rk[ 6] ^ rk[13];
250  rk[15] = rk[ 7] ^ rk[14];
251  temp = _mm_insert_epi32(temp, rk[15], 3);
252  }
253  else
254  temp = _mm_insert_epi32(temp, rk[7], 3);
255 
256  rk += keylen/4;
257  }
258 
259  if (!IsForwardTransformation())
260  {
261  rk = m_key;
262  unsigned int i, j;
263 
264  std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
265 
266  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
267  {
268  temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
269  *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
270  *(__m128i *)(rk+j) = temp;
271  }
272 
273  *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
274  }
275 
276  return;
277  }
278 #endif
279 
280  GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
281  const word32 *rc = rcon;
282  word32 temp;
283 
284  while (true)
285  {
286  temp = rk[keylen/4-1];
287  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
288  rk[keylen/4] = rk[0] ^ x ^ *(rc++);
289  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
290  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
291  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
292 
293  if (rk + keylen/4 + 4 == m_key.end())
294  break;
295 
296  if (keylen == 24)
297  {
298  rk[10] = rk[ 4] ^ rk[ 9];
299  rk[11] = rk[ 5] ^ rk[10];
300  }
301  else if (keylen == 32)
302  {
303  temp = rk[11];
304  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
305  rk[13] = rk[ 5] ^ rk[12];
306  rk[14] = rk[ 6] ^ rk[13];
307  rk[15] = rk[ 7] ^ rk[14];
308  }
309  rk += keylen/4;
310  }
311 
312  rk = m_key;
313 
314  if (IsForwardTransformation())
315  {
316  if (!s_TeFilled)
317  FillEncTable();
318 
319  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
320  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
321  }
322  else
323  {
324  if (!s_TdFilled)
325  FillDecTable();
326 
327  unsigned int i, j;
328 
329 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
330 
331  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
332  {
333  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
334  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
335  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
336  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
337  }
338 
339  rk[i+0] = InverseMixColumn(rk[i+0]);
340  rk[i+1] = InverseMixColumn(rk[i+1]);
341  rk[i+2] = InverseMixColumn(rk[i+2]);
342  rk[i+3] = InverseMixColumn(rk[i+3]);
343 
344  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
345  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
346  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
347  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
348  }
349 
350 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
351  if (HasAESNI())
352  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
353 #endif
354 }
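// [Editorial note] Key schedule sizes: a 16-, 24- or 32-byte key gives
// m_rounds = 10, 12 or 14 and a schedule of 4*(m_rounds+1) = 44, 52 or 60
// 32-bit words. For decryption the schedule is used in reverse and the inner
// round keys are passed through InverseMixColumn (or _mm_aesimc_si128 on the
// AES-NI path), the "equivalent inverse cipher" form expected by the round
// code below.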
355 
356 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
357 {
358 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
359 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
360  if (HasSSE2())
361 #else
362  if (HasAESNI())
363 #endif
364  {
365  return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
366  }
367 #endif
368 
369  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
370 
371  word32 s0, s1, s2, s3, t0, t1, t2, t3;
372  Block::Get(inBlock)(s0)(s1)(s2)(s3);
373 
374  const word32 *rk = m_key;
375  s0 ^= rk[0];
376  s1 ^= rk[1];
377  s2 ^= rk[2];
378  s3 ^= rk[3];
379  t0 = rk[4];
380  t1 = rk[5];
381  t2 = rk[6];
382  t3 = rk[7];
383  rk += 8;
384 
385  // timing attack countermeasure. see comments at top for more details.
386  // also see http://github.com/weidai11/cryptopp/issues/146
387  const int cacheLineSize = GetCacheLineSize();
388  unsigned int i;
389  volatile word32 _u = 0;
390  word32 u = _u;
391 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
392  for (i=0; i<2048; i+=cacheLineSize)
393 #else
394  for (i=0; i<1024; i+=cacheLineSize)
395 #endif
396  u &= *(const word32 *)(((const byte *)Te)+i);
397  u &= Te[255];
398  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
399 
400  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
401  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
402  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
403  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
404 
405  // Nr - 2 full rounds:
406  unsigned int r = m_rounds/2 - 1;
407  do
408  {
409  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
410 
411  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
412  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
413  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
414  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
415 
416  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
417 
418  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
419  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
420  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
421  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
422 
423  rk += 8;
424  } while (--r);
425 
426  word32 tbw[4];
427  byte *const tempBlock = (byte *)tbw;
428 
429  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
430  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
431  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
432  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
433 
434  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
435 }
436 
437 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
438 {
439 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
440  if (HasAESNI())
441  {
442  Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
443  return;
444  }
445 #endif
446 
447  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
448 
449  word32 s0, s1, s2, s3, t0, t1, t2, t3;
450  Block::Get(inBlock)(s0)(s1)(s2)(s3);
451 
452  const word32 *rk = m_key;
453  s0 ^= rk[0];
454  s1 ^= rk[1];
455  s2 ^= rk[2];
456  s3 ^= rk[3];
457  t0 = rk[4];
458  t1 = rk[5];
459  t2 = rk[6];
460  t3 = rk[7];
461  rk += 8;
462 
463  // timing attack countermeasure. see comments at top for more details.
464  // also see http://github.com/weidai11/cryptopp/issues/146
465  const int cacheLineSize = GetCacheLineSize();
466  unsigned int i;
467  volatile word32 _u = 0;
468  word32 u = _u;
469 #if defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS)
470  for (i=0; i<2048; i+=cacheLineSize)
471 #else
472  for (i=0; i<1024; i+=cacheLineSize)
473 #endif
474  u &= *(const word32 *)(((const byte *)Td)+i);
475  u &= Td[255];
476  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
477 
478  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
479  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
480  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
481  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
482 
483  // Nr - 2 full rounds:
484  unsigned int r = m_rounds/2 - 1;
485  do
486  {
487  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
488 
489  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
490  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
491  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
492  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
493 
494  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
495 
496  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
497  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
498  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
499  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
500 
501  rk += 8;
502  } while (--r);
503 
504 #if !(defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS) || defined(CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS))
505  // timing attack countermeasure. see comments at top for more details
506  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
507  // QUARTER_ROUND_LD will use Td, which is already preloaded.
508  u = 0;
509  for (i=0; i<256; i+=cacheLineSize)
510  u &= *(const word32 *)(Sd+i);
511  u &= *(const word32 *)(Sd+252);
512  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
513 #endif
514 
515  word32 tbw[4];
516  byte *const tempBlock = (byte *)tbw;
517 
518  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
519  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
520  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
521  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
522 
523  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
524 }
525 
526 // ************************* Assembly Code ************************************
527 
528 #if CRYPTOPP_MSC_VERSION
529 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
530 #endif
531 
532 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
533 
534 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
535 
536 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
537 {
538  CRYPTOPP_UNUSED(locals); CRYPTOPP_UNUSED(k);
539 
540 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
541 
542 #define L_REG esp
543 #define L_INDEX(i) (L_REG+768+i)
544 #define L_INXORBLOCKS L_INBLOCKS+4
545 #define L_OUTXORBLOCKS L_INBLOCKS+8
546 #define L_OUTBLOCKS L_INBLOCKS+12
547 #define L_INCREMENTS L_INDEX(16*15)
548 #define L_SP L_INDEX(16*16)
549 #define L_LENGTH L_INDEX(16*16+4)
550 #define L_KEYS_BEGIN L_INDEX(16*16+8)
551 
552 #define MOVD movd
553 #define MM(i) mm##i
554 
555 #define MXOR(a,b,c) \
556  AS2( movzx esi, b)\
557  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
558  AS2( pxor MM(a), mm7)\
559 
560 #define MMOV(a,b,c) \
561  AS2( movzx esi, b)\
562  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
563 
564 #else
565 
566 #define L_REG r8
567 #define L_INDEX(i) (L_REG+i)
568 #define L_INXORBLOCKS L_INBLOCKS+8
569 #define L_OUTXORBLOCKS L_INBLOCKS+16
570 #define L_OUTBLOCKS L_INBLOCKS+24
571 #define L_INCREMENTS L_INDEX(16*16)
572 #define L_LENGTH L_INDEX(16*18+8)
573 #define L_KEYS_BEGIN L_INDEX(16*19)
574 
575 #define MOVD mov
576 #define MM_0 r9d
577 #define MM_1 r12d
578 #ifdef __GNUC__
579 #define MM_2 r11d
580 #else
581 #define MM_2 r10d
582 #endif
583 #define MM(i) MM_##i
584 
585 #define MXOR(a,b,c) \
586  AS2( movzx esi, b)\
587  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
588 
589 #define MMOV(a,b,c) \
590  AS2( movzx esi, b)\
591  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
592 
593 #endif
594 
595 #define L_SUBKEYS L_INDEX(0)
596 #define L_SAVED_X L_SUBKEYS
597 #define L_KEY12 L_INDEX(16*12)
598 #define L_LASTROUND L_INDEX(16*13)
599 #define L_INBLOCKS L_INDEX(16*14)
600 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
601 
602 #define XOR(a,b,c) \
603  AS2( movzx esi, b)\
604  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
605 
606 #define MOV(a,b,c) \
607  AS2( movzx esi, b)\
608  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
609 
610 #ifdef CRYPTOPP_GENERATE_X64_MASM
611  ALIGN 8
612  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
613  rex_push_reg rsi
614  push_reg rdi
615  push_reg rbx
616  push_reg r12
617  .endprolog
618  mov L_REG, rcx
619  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
620  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
621 #elif defined(__GNUC__)
622  __asm__ __volatile__
623  (
624  INTEL_NOPREFIX
625  #if CRYPTOPP_BOOL_X64
626  AS2( mov L_REG, rcx)
627  #endif
628  AS_PUSH_IF86(bx)
629  AS_PUSH_IF86(bp)
630  AS2( mov AS_REG_7, WORD_REG(si))
631 #else
632  AS_PUSH_IF86(si)
633  AS_PUSH_IF86(di)
634  AS_PUSH_IF86(bx)
635  AS_PUSH_IF86(bp)
636  AS2( lea AS_REG_7, [Te])
637  AS2( mov edi, [g_cacheLineSize])
638 #endif
639 
640 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
641  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
642  AS2( lea esp, [ecx-768])
643 #endif
644 
645  // copy subkeys to stack
646  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
647  AS2( mov WORD_REG(ax), 16)
648  AS2( and WORD_REG(ax), WORD_REG(si))
649  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
650  AS2( movdqa [L_KEY12], xmm3)
651  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
652  AS2( sub WORD_REG(ax), WORD_REG(si))
653  ASL(0)
654  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
655  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
656  AS2( add WORD_REG(si), 16)
657  AS2( cmp WORD_REG(si), 16*12)
658  ASJ( jl, 0, b)
659 
660  // read subkeys 0, 1 and last
661  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
662  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
663  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
664  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
665  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
666  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
667 
668  // load table into cache
669  AS2( xor WORD_REG(ax), WORD_REG(ax))
670  ASL(9)
671  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
672  AS2( add WORD_REG(ax), WORD_REG(di))
673  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
674  AS2( add WORD_REG(ax), WORD_REG(di))
675  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
676  AS2( add WORD_REG(ax), WORD_REG(di))
677  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
678  AS2( add WORD_REG(ax), WORD_REG(di))
679  AS2( cmp WORD_REG(ax), 2048)
680  ASJ( jl, 9, b)
681  AS1( lfence)
682 
683  AS2( test DWORD PTR [L_LENGTH], 1)
684  ASJ( jz, 8, f)
685 
686  // counter mode one-time setup
687  AS2( mov WORD_REG(si), [L_INBLOCKS])
688  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
689  AS2( pxor xmm2, xmm1)
690  AS2( psrldq xmm1, 14)
691  AS2( movd eax, xmm1)
692  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
693  AS2( MOVD MM(2), eax)
694 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
695  AS2( mov eax, 1)
696  AS2( movd mm3, eax)
697 #endif
698 
699  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
700  AS2( movd eax, xmm2)
701  AS2( psrldq xmm2, 4)
702  AS2( movd edi, xmm2)
703  AS2( psrldq xmm2, 4)
704  MXOR( 1, al, 0) // 0
705  XOR( edx, ah, 1) // 1
706  AS2( shr eax, 16)
707  XOR( ecx, al, 2) // 2
708  XOR( ebx, ah, 3) // 3
709  AS2( mov eax, edi)
710  AS2( movd edi, xmm2)
711  AS2( psrldq xmm2, 4)
712  XOR( ebx, al, 0) // 4
713  MXOR( 1, ah, 1) // 5
714  AS2( shr eax, 16)
715  XOR( edx, al, 2) // 6
716  XOR( ecx, ah, 3) // 7
717  AS2( mov eax, edi)
718  AS2( movd edi, xmm2)
719  XOR( ecx, al, 0) // 8
720  XOR( ebx, ah, 1) // 9
721  AS2( shr eax, 16)
722  MXOR( 1, al, 2) // 10
723  XOR( edx, ah, 3) // 11
724  AS2( mov eax, edi)
725  XOR( edx, al, 0) // 12
726  XOR( ecx, ah, 1) // 13
727  AS2( shr eax, 16)
728  XOR( ebx, al, 2) // 14
729  AS2( psrldq xmm2, 3)
730 
731  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
732  AS2( mov eax, [L_KEY12+0*4])
733  AS2( mov edi, [L_KEY12+2*4])
734  AS2( MOVD MM(0), [L_KEY12+3*4])
735  MXOR( 0, cl, 3) /* 11 */
736  XOR( edi, bl, 3) /* 7 */
737  MXOR( 0, bh, 2) /* 6 */
738  AS2( shr ebx, 16) /* 4,5 */
739  XOR( eax, bl, 1) /* 5 */
740  MOV( ebx, bh, 0) /* 4 */
741  AS2( xor ebx, [L_KEY12+1*4])
742  XOR( eax, ch, 2) /* 10 */
743  AS2( shr ecx, 16) /* 8,9 */
744  XOR( eax, dl, 3) /* 15 */
745  XOR( ebx, dh, 2) /* 14 */
746  AS2( shr edx, 16) /* 12,13 */
747  XOR( edi, ch, 0) /* 8 */
748  XOR( ebx, cl, 1) /* 9 */
749  XOR( edi, dl, 1) /* 13 */
750  MXOR( 0, dh, 0) /* 12 */
751 
752  AS2( movd ecx, xmm2)
753  AS2( MOVD edx, MM(1))
754  AS2( MOVD [L_SAVED_X+3*4], MM(0))
755  AS2( mov [L_SAVED_X+0*4], eax)
756  AS2( mov [L_SAVED_X+1*4], ebx)
757  AS2( mov [L_SAVED_X+2*4], edi)
758  ASJ( jmp, 5, f)
759 
760  ASL(3)
761  // non-counter mode per-block setup
762  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
763  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
764  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
765  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
766  ASL(8)
767  AS2( mov WORD_REG(ax), [L_INBLOCKS])
768  AS2( movdqu xmm2, [WORD_REG(ax)])
769  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
770  AS2( movdqu xmm5, [WORD_REG(si)])
771  AS2( pxor xmm2, xmm1)
772  AS2( pxor xmm2, xmm5)
773 
774  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
775  AS2( movd eax, xmm2)
776  AS2( psrldq xmm2, 4)
777  AS2( movd edi, xmm2)
778  AS2( psrldq xmm2, 4)
779  MXOR( 1, al, 0) // 0
780  XOR( edx, ah, 1) // 1
781  AS2( shr eax, 16)
782  XOR( ecx, al, 2) // 2
783  XOR( ebx, ah, 3) // 3
784  AS2( mov eax, edi)
785  AS2( movd edi, xmm2)
786  AS2( psrldq xmm2, 4)
787  XOR( ebx, al, 0) // 4
788  MXOR( 1, ah, 1) // 5
789  AS2( shr eax, 16)
790  XOR( edx, al, 2) // 6
791  XOR( ecx, ah, 3) // 7
792  AS2( mov eax, edi)
793  AS2( movd edi, xmm2)
794  XOR( ecx, al, 0) // 8
795  XOR( ebx, ah, 1) // 9
796  AS2( shr eax, 16)
797  MXOR( 1, al, 2) // 10
798  XOR( edx, ah, 3) // 11
799  AS2( mov eax, edi)
800  XOR( edx, al, 0) // 12
801  XOR( ecx, ah, 1) // 13
802  AS2( shr eax, 16)
803  XOR( ebx, al, 2) // 14
804  MXOR( 1, ah, 3) // 15
805  AS2( MOVD eax, MM(1))
806 
807  AS2( add L_REG, [L_KEYS_BEGIN])
808  AS2( add L_REG, 4*16)
809  ASJ( jmp, 2, f)
810 
811  ASL(1)
812  // counter-mode per-block setup
813  AS2( MOVD ecx, MM(2))
814  AS2( MOVD edx, MM(1))
815  AS2( mov eax, [L_SAVED_X+0*4])
816  AS2( mov ebx, [L_SAVED_X+1*4])
817  AS2( xor cl, ch)
818  AS2( and WORD_REG(cx), 255)
819  ASL(5)
820 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
821  AS2( paddb MM(2), mm3)
822 #else
823  AS2( add MM(2), 1)
824 #endif
825  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
826  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
827  XOR( ebx, dl, 3)
828  MOV( ecx, dh, 2)
829  AS2( shr edx, 16)
830  AS2( xor ecx, [L_SAVED_X+2*4])
831  XOR( eax, dh, 0)
832  MOV( edx, dl, 1)
833  AS2( xor edx, [L_SAVED_X+3*4])
834 
835  AS2( add L_REG, [L_KEYS_BEGIN])
836  AS2( add L_REG, 3*16)
837  ASJ( jmp, 4, f)
838 
839 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
840 // out: eax, ebx, edi, mm0
841 #define ROUND() \
842  MXOR( 0, cl, 3) /* 11 */\
843  AS2( mov cl, al) /* 8,9,10,3 */\
844  XOR( edi, ah, 2) /* 2 */\
845  AS2( shr eax, 16) /* 0,1 */\
846  XOR( edi, bl, 3) /* 7 */\
847  MXOR( 0, bh, 2) /* 6 */\
848  AS2( shr ebx, 16) /* 4,5 */\
849  MXOR( 0, al, 1) /* 1 */\
850  MOV( eax, ah, 0) /* 0 */\
851  XOR( eax, bl, 1) /* 5 */\
852  MOV( ebx, bh, 0) /* 4 */\
853  XOR( eax, ch, 2) /* 10 */\
854  XOR( ebx, cl, 3) /* 3 */\
855  AS2( shr ecx, 16) /* 8,9 */\
856  XOR( eax, dl, 3) /* 15 */\
857  XOR( ebx, dh, 2) /* 14 */\
858  AS2( shr edx, 16) /* 12,13 */\
859  XOR( edi, ch, 0) /* 8 */\
860  XOR( ebx, cl, 1) /* 9 */\
861  XOR( edi, dl, 1) /* 13 */\
862  MXOR( 0, dh, 0) /* 12 */\
863 
864  ASL(2) // 2-round loop
865  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
866  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
867  ROUND()
868  AS2( mov ecx, edi)
869  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
870  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
871  AS2( MOVD edx, MM(0))
872 
873  ASL(4)
874  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
875  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
876  ROUND()
877  AS2( mov ecx, edi)
878  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
879  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
880  AS2( MOVD edx, MM(0))
881 
882  AS2( add L_REG, 32)
883  AS2( test L_REG, 255)
884  ASJ( jnz, 2, b)
885  AS2( sub L_REG, 16*16)
886 
887 #define LAST(a, b, c) \
888  AS2( movzx esi, a )\
889  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
890  AS2( movzx esi, b )\
891  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
892  AS2( mov WORD PTR [L_LASTROUND+c], di )\
893 
894  // last round
895  LAST(ch, dl, 2)
896  LAST(dh, al, 6)
897  AS2( shr edx, 16)
898  LAST(ah, bl, 10)
899  AS2( shr eax, 16)
900  LAST(bh, cl, 14)
901  AS2( shr ebx, 16)
902  LAST(dh, al, 12)
903  AS2( shr ecx, 16)
904  LAST(ah, bl, 0)
905  LAST(bh, cl, 4)
906  LAST(ch, dl, 8)
907 
908  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
909  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
910 
911  AS2( mov WORD_REG(cx), [L_LENGTH])
912  AS2( sub WORD_REG(cx), 16)
913 
914  AS2( movdqu xmm2, [WORD_REG(ax)])
915  AS2( pxor xmm2, xmm4)
916 
917 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
918  AS2( movdqa xmm0, [L_INCREMENTS])
919  AS2( paddd xmm0, [L_INBLOCKS])
920  AS2( movdqa [L_INBLOCKS], xmm0)
921 #else
922  AS2( movdqa xmm0, [L_INCREMENTS+16])
923  AS2( paddq xmm0, [L_INBLOCKS+16])
924  AS2( movdqa [L_INBLOCKS+16], xmm0)
925 #endif
926 
927  AS2( pxor xmm2, [L_LASTROUND])
928  AS2( movdqu [WORD_REG(bx)], xmm2)
929 
930  ASJ( jle, 7, f)
931  AS2( mov [L_LENGTH], WORD_REG(cx))
932  AS2( test WORD_REG(cx), 1)
933  ASJ( jnz, 1, b)
934 #if CRYPTOPP_BOOL_X64
935  AS2( movdqa xmm0, [L_INCREMENTS])
936  AS2( paddq xmm0, [L_INBLOCKS])
937  AS2( movdqa [L_INBLOCKS], xmm0)
938 #endif
939  ASJ( jmp, 3, b)
940 
941  ASL(7)
942  // erase keys on stack
943  AS2( xorps xmm0, xmm0)
944  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
945  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
946  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
947  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
948  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
949  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
950  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
951  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
952  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
953  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
954  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
955  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
956  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
957  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
958  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
959 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
960  AS2( mov esp, [L_SP])
961  AS1( emms)
962 #endif
963  AS_POP_IF86(bp)
964  AS_POP_IF86(bx)
965 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
966  AS_POP_IF86(di)
967  AS_POP_IF86(si)
968  AS1(ret)
969 #endif
970 #ifdef CRYPTOPP_GENERATE_X64_MASM
971  pop r12
972  pop rbx
973  pop rdi
974  pop rsi
975  ret
976  Rijndael_Enc_AdvancedProcessBlocks ENDP
977 #endif
978 #ifdef __GNUC__
979  ATT_PREFIX
980  :
981  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
982  : "memory", "cc", "%eax"
983  #if CRYPTOPP_BOOL_X64
984  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
985  #endif
986  );
987 #endif
988 }
989 
990 #endif
991 
992 #ifndef CRYPTOPP_GENERATE_X64_MASM
993 
994 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
995 extern "C" {
996 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
997 }
998 #endif
999 
1000 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
1001 
1002 static inline bool AliasedWithTable(const byte *begin, const byte *end)
1003 {
1004  size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
1005  size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
1006  if (t1 > t0)
1007  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
1008  else
1009  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
1010 }
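// [Editorial note] AliasedWithTable reports whether [begin, end) occupies the
// same offsets modulo 4096 as the Te table. It is used below when placing the
// assembly routine's stack locals with alloca/_malloca: a region that aliases
// the table could evict the preloaded Te cache lines, so allocation is retried
// until a non-aliasing region is found.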
1011 
1012 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1013 
1014 inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1015 {
1016  block = _mm_xor_si128(block, subkeys[0]);
1017  for (unsigned int i=1; i<rounds-1; i+=2)
1018  {
1019  block = _mm_aesenc_si128(block, subkeys[i]);
1020  block = _mm_aesenc_si128(block, subkeys[i+1]);
1021  }
1022  block = _mm_aesenc_si128(block, subkeys[rounds-1]);
1023  block = _mm_aesenclast_si128(block, subkeys[rounds]);
1024 }
1025 
1026 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1027 {
1028  __m128i rk = subkeys[0];
1029  block0 = _mm_xor_si128(block0, rk);
1030  block1 = _mm_xor_si128(block1, rk);
1031  block2 = _mm_xor_si128(block2, rk);
1032  block3 = _mm_xor_si128(block3, rk);
1033  for (unsigned int i=1; i<rounds; i++)
1034  {
1035  rk = subkeys[i];
1036  block0 = _mm_aesenc_si128(block0, rk);
1037  block1 = _mm_aesenc_si128(block1, rk);
1038  block2 = _mm_aesenc_si128(block2, rk);
1039  block3 = _mm_aesenc_si128(block3, rk);
1040  }
1041  rk = subkeys[rounds];
1042  block0 = _mm_aesenclast_si128(block0, rk);
1043  block1 = _mm_aesenclast_si128(block1, rk);
1044  block2 = _mm_aesenclast_si128(block2, rk);
1045  block3 = _mm_aesenclast_si128(block3, rk);
1046 }
1047 
1048 inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1049 {
1050  block = _mm_xor_si128(block, subkeys[0]);
1051  for (unsigned int i=1; i<rounds-1; i+=2)
1052  {
1053  block = _mm_aesdec_si128(block, subkeys[i]);
1054  block = _mm_aesdec_si128(block, subkeys[i+1]);
1055  }
1056  block = _mm_aesdec_si128(block, subkeys[rounds-1]);
1057  block = _mm_aesdeclast_si128(block, subkeys[rounds]);
1058 }
1059 
1060 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1061 {
1062  __m128i rk = subkeys[0];
1063  block0 = _mm_xor_si128(block0, rk);
1064  block1 = _mm_xor_si128(block1, rk);
1065  block2 = _mm_xor_si128(block2, rk);
1066  block3 = _mm_xor_si128(block3, rk);
1067  for (unsigned int i=1; i<rounds; i++)
1068  {
1069  rk = subkeys[i];
1070  block0 = _mm_aesdec_si128(block0, rk);
1071  block1 = _mm_aesdec_si128(block1, rk);
1072  block2 = _mm_aesdec_si128(block2, rk);
1073  block3 = _mm_aesdec_si128(block3, rk);
1074  }
1075  rk = subkeys[rounds];
1076  block0 = _mm_aesdeclast_si128(block0, rk);
1077  block1 = _mm_aesdeclast_si128(block1, rk);
1078  block2 = _mm_aesdeclast_si128(block2, rk);
1079  block3 = _mm_aesdeclast_si128(block3, rk);
1080 }
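// [Editorial note] The four helpers above expect subkeys[0..rounds]:
// subkeys[0] is the initial whitening key, subkeys[1..rounds-1] feed
// aesenc/aesdec, and subkeys[rounds] feeds aesenclast/aesdeclast. For
// decryption the inner keys are already in InvMixColumns form, prepared with
// _mm_aesimc_si128 in UncheckedSetKey above.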
1081 
1082 static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
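// [Editorial note] s_one places the 32-bit value 1<<24 in the last lane of a
// 128-bit block, i.e. a 1 in byte 15. The parallel path below uses it with
// _mm_add_epi32 to derive counter+1, counter+2 and counter+3 from the loaded
// counter block and to store the next starting counter back to the input.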
1083 
1084 template <typename F1, typename F4>
1085 inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1086 {
1087  size_t blockSize = 16;
1088  size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1089  size_t xorIncrement = xorBlocks ? blockSize : 0;
1090  size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
1091 
1092  if (flags & BlockTransformation::BT_ReverseDirection)
1093  {
1094  assert(length % blockSize == 0);
1095  inBlocks += length - blockSize;
1096  xorBlocks += length - blockSize;
1097  outBlocks += length - blockSize;
1098  inIncrement = 0-inIncrement;
1099  xorIncrement = 0-xorIncrement;
1100  outIncrement = 0-outIncrement;
1101  }
1102 
1103  if (flags & BlockTransformation::BT_AllowParallel)
1104  {
1105  while (length >= 4*blockSize)
1106  {
1107  __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
1108  if (flags & BlockTransformation::BT_InBlockIsCounter)
1109  {
1110  const __m128i be1 = *(const __m128i *)s_one;
1111  block1 = _mm_add_epi32(block0, be1);
1112  block2 = _mm_add_epi32(block1, be1);
1113  block3 = _mm_add_epi32(block2, be1);
1114  _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
1115  }
1116  else
1117  {
1118  inBlocks += inIncrement;
1119  block1 = _mm_loadu_si128((const __m128i *)inBlocks);
1120  inBlocks += inIncrement;
1121  block2 = _mm_loadu_si128((const __m128i *)inBlocks);
1122  inBlocks += inIncrement;
1123  block3 = _mm_loadu_si128((const __m128i *)inBlocks);
1124  inBlocks += inIncrement;
1125  }
1126 
1127  if (flags & BlockTransformation::BT_XorInput)
1128  {
1129  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
1130  xorBlocks += xorIncrement;
1131  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
1132  xorBlocks += xorIncrement;
1133  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
1134  xorBlocks += xorIncrement;
1135  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
1136  xorBlocks += xorIncrement;
1137  }
1138 
1139  func4(block0, block1, block2, block3, subkeys, rounds);
1140 
1141  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1142  {
1143  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
1144  xorBlocks += xorIncrement;
1145  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
1146  xorBlocks += xorIncrement;
1147  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
1148  xorBlocks += xorIncrement;
1149  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
1150  xorBlocks += xorIncrement;
1151  }
1152 
1153  _mm_storeu_si128((__m128i *)outBlocks, block0);
1154  outBlocks += outIncrement;
1155  _mm_storeu_si128((__m128i *)outBlocks, block1);
1156  outBlocks += outIncrement;
1157  _mm_storeu_si128((__m128i *)outBlocks, block2);
1158  outBlocks += outIncrement;
1159  _mm_storeu_si128((__m128i *)outBlocks, block3);
1160  outBlocks += outIncrement;
1161 
1162  length -= 4*blockSize;
1163  }
1164  }
1165 
1166  while (length >= blockSize)
1167  {
1168  __m128i block = _mm_loadu_si128((const __m128i *)inBlocks);
1169 
1170  if (flags & BlockTransformation::BT_XorInput)
1171  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
1172 
1173  if (flags & BlockTransformation::BT_InBlockIsCounter)
1174  const_cast<byte *>(inBlocks)[15]++;
1175 
1176  func1(block, subkeys, rounds);
1177 
1178  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1179  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
1180 
1181  _mm_storeu_si128((__m128i *)outBlocks, block);
1182 
1183  inBlocks += inIncrement;
1184  outBlocks += outIncrement;
1185  xorBlocks += xorIncrement;
1186  length -= blockSize;
1187  }
1188 
1189  return length;
1190 }
1191 #endif
1192 
1193 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1194 {
1195 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1196  if (HasAESNI())
1197  return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1198 #endif
1199 
1200 #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM)
1201  if (HasSSE2())
1202  {
1203  if (length < BLOCKSIZE)
1204  return length;
1205 
1206  struct Locals
1207  {
1208  word32 subkeys[4*12], workspace[8];
1209  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
1210  byte *outBlocks;
1211  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
1212  size_t regSpill, lengthAndCounterFlag, keysBegin;
1213  };
1214 
1215  size_t increment = BLOCKSIZE;
1216  const byte* zeros = (byte *)(Te+256);
1217  byte *space;
1218 
1219  do {
1220  // https://msdn.microsoft.com/en-us/library/5471dc8s.aspx
1221 #if (CRYPTOPP_MSC_VERSION >= 1400)
1222  space = (byte *)_malloca(255+sizeof(Locals));
1223  space += (256-(size_t)space%256)%256;
1224 #else
1225  space = (byte *)alloca(255+sizeof(Locals));
1226  space += (256-(size_t)space%256)%256;
1227 #endif
1228  }
1229  while (AliasedWithTable(space, space+sizeof(Locals)));
1230 
1231  if (flags & BT_ReverseDirection)
1232  {
1233  assert(length % BLOCKSIZE == 0);
1234  inBlocks += length - BLOCKSIZE;
1235  xorBlocks += length - BLOCKSIZE;
1236  outBlocks += length - BLOCKSIZE;
1237  increment = 0-increment;
1238  }
1239 
1240  Locals &locals = *(Locals *)space;
1241 
1242  locals.inBlocks = inBlocks;
1243  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1244  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1245  locals.outBlocks = outBlocks;
1246 
1247  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1248  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1249  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1250  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1251 
1252  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1253  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1254  locals.keysBegin = (12-keysToCopy)*16;
1255 
1256  Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
1257  return length % BLOCKSIZE;
1258  }
1259 #endif
1260 
1261  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1262 }
1263 
1264 #endif
1265 
1266 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1267 
1268 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1269 {
1270  if (HasAESNI())
1271  return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1272 
1273  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1274 }
1275 
1276 #endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1277 
1278 NAMESPACE_END
1279 
1280 #endif
1281 #endif