Crypto++
rijndael.cpp
1 // rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
2 // and Wei Dai from Paulo Barreto's Rijndael implementation
3 // The original code and all modifications are in the public domain.
4 
5 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code
6 
7 /*
8 July 2010: Added support for AES-NI instructions via compiler intrinsics.
9 */
10 
11 /*
12 Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
13 caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
14 and Peter Schwabe in their paper "New AES software speed records". The round
15 function was also modified to include a trick similar to one in Brian Gladman's
16 x86 assembly code, doing an 8-bit register move to minimize the number of
17 register spills. Also switched to compressed tables and copying round keys to
18 the stack.
19 
20 The C++ implementation now uses compressed tables if
21 CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
22 */
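/*
A sketch of the compressed table layout used when
CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined (illustrative only; the
offsets follow FillEncTable() and TL_M/TL_F below):

    For an input byte b, let s = Se[b]. Te[b] is a 64-bit entry whose bytes
    (little-endian) are
        offset:  0   1   2   3     4     5   6   7
        value:   0   s   s   2*s   3*s   s   s   2*s     (GF(2^8) products)
    A 32-bit load at byte offset 4 gives 3*s | s<<8 | s<<16 | (2*s)<<24, the
    canonical first-column table entry; loads at offsets 1, 2 and 3 give the
    same value rotated right by 8, 16 and 24 bits. This is what the
    "(i+3)%4+1" offset in TL_M(T, i, x) selects, letting one 2 KB table stand
    in for the usual four 1 KB tables.
*/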
23 
24 /*
25 July 2006: Defense against timing attacks was added by Wei Dai.
26 
27 The code now uses smaller tables in the first and last rounds,
28 and preloads them into L1 cache before usage (by loading at least
29 one element in each cache line).
30 
31 We try to delay subsequent accesses to each table (used in the first
32 and last rounds) until all of the table has been preloaded. Hopefully
33 the compiler isn't smart enough to optimize that code away.
34 
35 After preloading the table, we also try not to access any memory location
36 other than the table and the stack, in order to prevent table entries from
37 being unloaded from L1 cache, until that round is finished.
38 (Some popular CPUs have 2-way associative caches.)
39 */
40 
41 // This is the original introductory comment:
42 
43 /**
44  * version 3.0 (December 2000)
45  *
46  * Optimised ANSI C code for the Rijndael cipher (now AES)
47  *
48  * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
49  * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
50  * author Paulo Barreto <paulo.barreto@terra.com.br>
51  *
52  * This code is hereby placed in the public domain.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
55  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
56  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
58  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
59  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
60  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
61  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
62  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
63  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
64  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65  */
66 
67 #include "pch.h"
68 
69 #ifndef CRYPTOPP_IMPORTS
70 #ifndef CRYPTOPP_GENERATE_X64_MASM
71 
72 #include "rijndael.h"
73 #include "misc.h"
74 #include "cpu.h"
75 
76 NAMESPACE_BEGIN(CryptoPP)
77 
78 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
79 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
80 namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
81 using namespace rdtable;
82 #else
83 static word64 Te[256];
84 #endif
85 static word64 Td[256];
86 #else
87 static word32 Te[256*4], Td[256*4];
88 #endif
89 static volatile bool s_TeFilled = false, s_TdFilled = false;
90 
91 // ************************* Portable Code ************************************
92 
93 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
94  a ^= L(T, 3, byte(t)); t >>= 8;\
95  b ^= L(T, 2, byte(t)); t >>= 8;\
96  c ^= L(T, 1, byte(t)); t >>= 8;\
97  d ^= L(T, 0, t);
98 
99 #define QUARTER_ROUND_LE(t, a, b, c, d) \
100  tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
101  tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
102  tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
103  tempBlock[d] = ((byte *)(Te+t))[1];
104 
105 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
106  #define QUARTER_ROUND_LD(t, a, b, c, d) \
107  tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
108  tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
109  tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
110  tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
111 #else
112  #define QUARTER_ROUND_LD(t, a, b, c, d) \
113  tempBlock[a] = Sd[byte(t)]; t >>= 8;\
114  tempBlock[b] = Sd[byte(t)]; t >>= 8;\
115  tempBlock[c] = Sd[byte(t)]; t >>= 8;\
116  tempBlock[d] = Sd[t];
117 #endif
118 
119 #define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
120 #define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)
121 
122 #ifdef IS_LITTLE_ENDIAN
123  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
124  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
125  #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
126  #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
127  #define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
128  #else
129  #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
130  #define TL_M(T, i, x) T[i*256 + x]
131  #endif
132 #else
133  #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
134  #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
135  #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
136  #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
137  #define TL_M TL_F
138  #else
139  #define TL_F(T, i, x) rotrFixed(T[x], i*8)
140  #define TL_M(T, i, x) T[i*256 + x]
141  #endif
142 #endif
143 
144 
145 #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
146 #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
147 #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
148 
149 #define f3(x) (f2(x) ^ x)
150 #define f9(x) (f8(x) ^ x)
151 #define fb(x) (f8(x) ^ f2(x) ^ x)
152 #define fd(x) (f8(x) ^ f4(x) ^ x)
153 #define fe(x) (f8(x) ^ f4(x) ^ f2(x))
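// The macros above are fixed-constant multiplications in GF(2^8) modulo the
// AES polynomial 0x11b: f2/f3 give the MixColumns coefficients 2 and 3, and
// f9/fb/fd/fe the InvMixColumns coefficients 9, 0xb, 0xd and 0xe. As a
// minimal illustrative sketch (not part of the library), a generic multiply
// that these macros specialize could look like this:
static inline unsigned int GF256Mul_Example(unsigned int a, unsigned int b)
{
	unsigned int r = 0;
	for (int i = 0; i < 8; i++)
	{
		if (b & 1)
			r ^= a;        // add (xor) the current multiple of a
		b >>= 1;
		a <<= 1;           // multiply a by x
		if (a & 0x100)
			a ^= 0x11b;    // reduce modulo x^8+x^4+x^3+x+1
	}
	return r;
}
// e.g. GF256Mul_Example(x, 2) == f2(x) and GF256Mul_Example(x, 0x0e) == fe(x)
// for all x < 256; f4 and f8 are f2 applied twice and three times.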
154 
155 void Rijndael::Base::FillEncTable()
156 {
157  for (int i=0; i<256; i++)
158  {
159  byte x = Se[i];
160 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
161  word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
162  Te[i] = word64(y | f3(x))<<32 | y;
163 #else
164  word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
165  for (int j=0; j<4; j++)
166  {
167  Te[i+j*256] = y;
168  y = rotrFixed(y, 8);
169  }
170 #endif
171  }
172 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
173  Te[256] = Te[257] = 0;
174 #endif
175  s_TeFilled = true;
176 }
177 
178 void Rijndael::Base::FillDecTable()
179 {
180  for (int i=0; i<256; i++)
181  {
182  byte x = Sd[i];
183 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
184  word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
185  Td[i] = word64(y | fb(x))<<32 | y | x;
186 #else
187  word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
188  for (int j=0; j<4; j++)
189  {
190  Td[i+j*256] = y;
191  y = rotrFixed(y, 8);
192  }
193 #endif
194  }
195  s_TdFilled = true;
196 }
197 
198 void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
199 {
200  AssertValidKeyLength(keylen);
201 
202  m_rounds = keylen/4 + 6;
203  m_key.New(4*(m_rounds+1));
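 // e.g. a 16-byte key gives m_rounds = 10 and 4*(10+1) = 44 round-key words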
204 
205  word32 *rk = m_key;
206 
207 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)
208  // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64
209  if (HasAESNI())
210  {
211  static const word32 rcLE[] = {
212  0x01, 0x02, 0x04, 0x08,
213  0x10, 0x20, 0x40, 0x80,
214  0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
215  };
216  const word32 *rc = rcLE;
217 
218  __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
219  memcpy(rk, userKey, keylen);
220 
221  while (true)
222  {
223  rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
224  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
225  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
226  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
227 
228  if (rk + keylen/4 + 4 == m_key.end())
229  break;
230 
231  if (keylen == 24)
232  {
233  rk[10] = rk[ 4] ^ rk[ 9];
234  rk[11] = rk[ 5] ^ rk[10];
235  temp = _mm_insert_epi32(temp, rk[11], 3);
236  }
237  else if (keylen == 32)
238  {
239  temp = _mm_insert_epi32(temp, rk[11], 3);
240  rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
241  rk[13] = rk[ 5] ^ rk[12];
242  rk[14] = rk[ 6] ^ rk[13];
243  rk[15] = rk[ 7] ^ rk[14];
244  temp = _mm_insert_epi32(temp, rk[15], 3);
245  }
246  else
247  temp = _mm_insert_epi32(temp, rk[7], 3);
248 
249  rk += keylen/4;
250  }
251 
252  if (!IsForwardTransformation())
253  {
254  rk = m_key;
255  unsigned int i, j;
256 
257  std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));
258 
259  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
260  {
261  temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
262  *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
263  *(__m128i *)(rk+j) = temp;
264  }
265 
266  *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
267  }
268 
269  return;
270  }
271 #endif
272 
273  GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
274  const word32 *rc = rcon;
275  word32 temp;
276 
277  while (true)
278  {
279  temp = rk[keylen/4-1];
280  word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
281  rk[keylen/4] = rk[0] ^ x ^ *(rc++);
282  rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
283  rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
284  rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];
285 
286  if (rk + keylen/4 + 4 == m_key.end())
287  break;
288 
289  if (keylen == 24)
290  {
291  rk[10] = rk[ 4] ^ rk[ 9];
292  rk[11] = rk[ 5] ^ rk[10];
293  }
294  else if (keylen == 32)
295  {
296  temp = rk[11];
297  rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
298  rk[13] = rk[ 5] ^ rk[12];
299  rk[14] = rk[ 6] ^ rk[13];
300  rk[15] = rk[ 7] ^ rk[14];
301  }
302  rk += keylen/4;
303  }
304 
305  rk = m_key;
306 
307  if (IsForwardTransformation())
308  {
309  if (!s_TeFilled)
310  FillEncTable();
311 
312  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
313  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
314  }
315  else
316  {
317  if (!s_TdFilled)
318  FillDecTable();
319 
320  unsigned int i, j;
321 
322 #define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])
323 
324  for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
325  {
326  temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp;
327  temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
328  temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
329  temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;
330  }
331 
332  rk[i+0] = InverseMixColumn(rk[i+0]);
333  rk[i+1] = InverseMixColumn(rk[i+1]);
334  rk[i+2] = InverseMixColumn(rk[i+2]);
335  rk[i+3] = InverseMixColumn(rk[i+3]);
336 
337  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
338  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
339  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
340  temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;
341  }
342 
343 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
344  if (HasAESNI())
345  ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
346 #endif
347 }
348 
349 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
350 {
351 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
352 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
353  if (HasSSE2())
354 #else
355  if (HasAESNI())
356 #endif
357  {
358  Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
359  return;
360  }
361 #endif
362 
363  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
364 
365  word32 s0, s1, s2, s3, t0, t1, t2, t3;
366  Block::Get(inBlock)(s0)(s1)(s2)(s3);
367 
368  const word32 *rk = m_key;
369  s0 ^= rk[0];
370  s1 ^= rk[1];
371  s2 ^= rk[2];
372  s3 ^= rk[3];
373  t0 = rk[4];
374  t1 = rk[5];
375  t2 = rk[6];
376  t3 = rk[7];
377  rk += 8;
378 
379  // timing attack countermeasure. see comments at top for more details
380  const int cacheLineSize = GetCacheLineSize();
381  unsigned int i;
382  word32 u = 0;
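 // The loop below ANDs u (which stays zero) with one word from each cache
 // line, forcing the whole table into L1; OR-ing the zero into the state
 // afterwards creates a data dependency that keeps the compiler from
 // optimizing the preload away.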
383 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
384  for (i=0; i<2048; i+=cacheLineSize)
385 #else
386  for (i=0; i<1024; i+=cacheLineSize)
387 #endif
388  u &= *(const word32 *)(((const byte *)Te)+i);
389  u &= Te[255];
390  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
391 
392  QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
393  QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
394  QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
395  QUARTER_ROUND_FE(s0, t1, t2, t3, t0)
396 
397  // Nr - 2 full rounds:
398  unsigned int r = m_rounds/2 - 1;
399  do
400  {
401  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
402 
403  QUARTER_ROUND_E(t3, s0, s1, s2, s3)
404  QUARTER_ROUND_E(t2, s3, s0, s1, s2)
405  QUARTER_ROUND_E(t1, s2, s3, s0, s1)
406  QUARTER_ROUND_E(t0, s1, s2, s3, s0)
407 
408  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
409 
410  QUARTER_ROUND_E(s3, t0, t1, t2, t3)
411  QUARTER_ROUND_E(s2, t3, t0, t1, t2)
412  QUARTER_ROUND_E(s1, t2, t3, t0, t1)
413  QUARTER_ROUND_E(s0, t1, t2, t3, t0)
414 
415  rk += 8;
416  } while (--r);
417 
418  word32 tbw[4];
419  byte *const tempBlock = (byte *)tbw;
420 
421  QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
422  QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
423  QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
424  QUARTER_ROUND_LE(t3, 3, 6, 9, 12)
425 
426  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
427 }
428 
429 void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
430 {
431 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
432  if (HasAESNI())
433  {
434  Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
435  return;
436  }
437 #endif
438 
438 
439  typedef BlockGetAndPut<word32, NativeByteOrder> Block;
440 
441  word32 s0, s1, s2, s3, t0, t1, t2, t3;
442  Block::Get(inBlock)(s0)(s1)(s2)(s3);
443 
444  const word32 *rk = m_key;
445  s0 ^= rk[0];
446  s1 ^= rk[1];
447  s2 ^= rk[2];
448  s3 ^= rk[3];
449  t0 = rk[4];
450  t1 = rk[5];
451  t2 = rk[6];
452  t3 = rk[7];
453  rk += 8;
454 
455  // timing attack countermeasure. see comments at top for more details
456  const int cacheLineSize = GetCacheLineSize();
457  unsigned int i;
458  word32 u = 0;
459 #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
460  for (i=0; i<2048; i+=cacheLineSize)
461 #else
462  for (i=0; i<1024; i+=cacheLineSize)
463 #endif
464  u &= *(const word32 *)(((const byte *)Td)+i);
465  u &= Td[255];
466  s0 |= u; s1 |= u; s2 |= u; s3 |= u;
467 
468  QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
469  QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
470  QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
471  QUARTER_ROUND_FD(s0, t3, t2, t1, t0)
472 
473  // Nr - 2 full rounds:
474  unsigned int r = m_rounds/2 - 1;
475  do
476  {
477  s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
478 
479  QUARTER_ROUND_D(t3, s2, s1, s0, s3)
480  QUARTER_ROUND_D(t2, s1, s0, s3, s2)
481  QUARTER_ROUND_D(t1, s0, s3, s2, s1)
482  QUARTER_ROUND_D(t0, s3, s2, s1, s0)
483 
484  t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
485 
486  QUARTER_ROUND_D(s3, t2, t1, t0, t3)
487  QUARTER_ROUND_D(s2, t1, t0, t3, t2)
488  QUARTER_ROUND_D(s1, t0, t3, t2, t1)
489  QUARTER_ROUND_D(s0, t3, t2, t1, t0)
490 
491  rk += 8;
492  } while (--r);
493 
494 #ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
495  // timing attack countermeasure. see comments at top for more details
496  // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
497  // QUARTER_ROUND_LD will use Td, which is already preloaded.
498  u = 0;
499  for (i=0; i<256; i+=cacheLineSize)
500  u &= *(const word32 *)(Sd+i);
501  u &= *(const word32 *)(Sd+252);
502  t0 |= u; t1 |= u; t2 |= u; t3 |= u;
503 #endif
504 
505  word32 tbw[4];
506  byte *const tempBlock = (byte *)tbw;
507 
508  QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
509  QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
510  QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
511  QUARTER_ROUND_LD(t3, 11, 6, 1, 12)
512 
513  Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
514 }
515 
516 // ************************* Assembly Code ************************************
517 
518 #pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
519 
520 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
521 
522 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
523 
524 CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
525 {
526 #if CRYPTOPP_BOOL_X86
527 
528 #define L_REG esp
529 #define L_INDEX(i) (L_REG+768+i)
530 #define L_INXORBLOCKS L_INBLOCKS+4
531 #define L_OUTXORBLOCKS L_INBLOCKS+8
532 #define L_OUTBLOCKS L_INBLOCKS+12
533 #define L_INCREMENTS L_INDEX(16*15)
534 #define L_SP L_INDEX(16*16)
535 #define L_LENGTH L_INDEX(16*16+4)
536 #define L_KEYS_BEGIN L_INDEX(16*16+8)
537 
538 #define MOVD movd
539 #define MM(i) mm##i
540 
541 #define MXOR(a,b,c) \
542  AS2( movzx esi, b)\
543  AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
544  AS2( pxor MM(a), mm7)\
545 
546 #define MMOV(a,b,c) \
547  AS2( movzx esi, b)\
548  AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
549 
550 #else
551 
552 #define L_REG r8
553 #define L_INDEX(i) (L_REG+i)
554 #define L_INXORBLOCKS L_INBLOCKS+8
555 #define L_OUTXORBLOCKS L_INBLOCKS+16
556 #define L_OUTBLOCKS L_INBLOCKS+24
557 #define L_INCREMENTS L_INDEX(16*16)
558 #define L_LENGTH L_INDEX(16*18+8)
559 #define L_KEYS_BEGIN L_INDEX(16*19)
560 
561 #define MOVD mov
562 #define MM_0 r9d
563 #define MM_1 r12d
564 #ifdef __GNUC__
565 #define MM_2 r11d
566 #else
567 #define MM_2 r10d
568 #endif
569 #define MM(i) MM_##i
570 
571 #define MXOR(a,b,c) \
572  AS2( movzx esi, b)\
573  AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
574 
575 #define MMOV(a,b,c) \
576  AS2( movzx esi, b)\
577  AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
578 
579 #endif
580 
581 #define L_SUBKEYS L_INDEX(0)
582 #define L_SAVED_X L_SUBKEYS
583 #define L_KEY12 L_INDEX(16*12)
584 #define L_LASTROUND L_INDEX(16*13)
585 #define L_INBLOCKS L_INDEX(16*14)
586 #define MAP0TO4(i) (ASM_MOD(i+3,4)+1)
587 
588 #define XOR(a,b,c) \
589  AS2( movzx esi, b)\
590  AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
591 
592 #define MOV(a,b,c) \
593  AS2( movzx esi, b)\
594  AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
595 
596 #ifdef CRYPTOPP_GENERATE_X64_MASM
597  ALIGN 8
598  Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
599  rex_push_reg rsi
600  push_reg rdi
601  push_reg rbx
602  push_reg r12
603  .endprolog
604  mov L_REG, rcx
605  mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
606  mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]
607 #elif defined(__GNUC__)
608  __asm__ __volatile__
609  (
610  ".intel_syntax noprefix;"
611  #if CRYPTOPP_BOOL_X64
612  AS2( mov L_REG, rcx)
613  #endif
614  AS_PUSH_IF86(bx)
615  AS_PUSH_IF86(bp)
616  AS2( mov AS_REG_7, WORD_REG(si))
617 #else
618  AS_PUSH_IF86(si)
619  AS_PUSH_IF86(di)
620  AS_PUSH_IF86(bx)
621  AS_PUSH_IF86(bp)
622  AS2( lea AS_REG_7, [Te])
623  AS2( mov edi, [g_cacheLineSize])
624 #endif
625 
626 #if CRYPTOPP_BOOL_X86
627  AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP
628  AS2( lea esp, [ecx-768])
629 #endif
630 
631  // copy subkeys to stack
632  AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
633  AS2( mov WORD_REG(ax), 16)
634  AS2( and WORD_REG(ax), WORD_REG(si))
635  AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter)
636  AS2( movdqa [L_KEY12], xmm3)
637  AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
638  AS2( sub WORD_REG(ax), WORD_REG(si))
639  ASL(0)
640  AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
641  AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
642  AS2( add WORD_REG(si), 16)
643  AS2( cmp WORD_REG(si), 16*12)
644  ASJ( jl, 0, b)
645 
646  // read subkeys 0, 1 and last
647  AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey
648  AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0
649  AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3
650  AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7
651  AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11
652  AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15
653 
654  // load table into cache
655  AS2( xor WORD_REG(ax), WORD_REG(ax))
656  ASL(9)
657  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
658  AS2( add WORD_REG(ax), WORD_REG(di))
659  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
660  AS2( add WORD_REG(ax), WORD_REG(di))
661  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
662  AS2( add WORD_REG(ax), WORD_REG(di))
663  AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
664  AS2( add WORD_REG(ax), WORD_REG(di))
665  AS2( cmp WORD_REG(ax), 2048)
666  ASJ( jl, 9, b)
667  AS1( lfence)
668 
669  AS2( test DWORD PTR [L_LENGTH], 1)
670  ASJ( jz, 8, f)
671 
672  // counter mode one-time setup
673  AS2( mov WORD_REG(si), [L_INBLOCKS])
674  AS2( movdqu xmm2, [WORD_REG(si)]) // counter
675  AS2( pxor xmm2, xmm1)
676  AS2( psrldq xmm1, 14)
677  AS2( movd eax, xmm1)
678  AS2( mov al, BYTE PTR [WORD_REG(si)+15])
679  AS2( MOVD MM(2), eax)
680 #if CRYPTOPP_BOOL_X86
681  AS2( mov eax, 1)
682  AS2( movd mm3, eax)
683 #endif
684 
685  // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx
686  AS2( movd eax, xmm2)
687  AS2( psrldq xmm2, 4)
688  AS2( movd edi, xmm2)
689  AS2( psrldq xmm2, 4)
690  MXOR( 1, al, 0) // 0
691  XOR( edx, ah, 1) // 1
692  AS2( shr eax, 16)
693  XOR( ecx, al, 2) // 2
694  XOR( ebx, ah, 3) // 3
695  AS2( mov eax, edi)
696  AS2( movd edi, xmm2)
697  AS2( psrldq xmm2, 4)
698  XOR( ebx, al, 0) // 4
699  MXOR( 1, ah, 1) // 5
700  AS2( shr eax, 16)
701  XOR( edx, al, 2) // 6
702  XOR( ecx, ah, 3) // 7
703  AS2( mov eax, edi)
704  AS2( movd edi, xmm2)
705  XOR( ecx, al, 0) // 8
706  XOR( ebx, ah, 1) // 9
707  AS2( shr eax, 16)
708  MXOR( 1, al, 2) // 10
709  XOR( edx, ah, 3) // 11
710  AS2( mov eax, edi)
711  XOR( edx, al, 0) // 12
712  XOR( ecx, ah, 1) // 13
713  AS2( shr eax, 16)
714  XOR( ebx, al, 2) // 14
715  AS2( psrldq xmm2, 3)
716 
717  // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0
718  AS2( mov eax, [L_KEY12+0*4])
719  AS2( mov edi, [L_KEY12+2*4])
720  AS2( MOVD MM(0), [L_KEY12+3*4])
721  MXOR( 0, cl, 3) /* 11 */
722  XOR( edi, bl, 3) /* 7 */
723  MXOR( 0, bh, 2) /* 6 */
724  AS2( shr ebx, 16) /* 4,5 */
725  XOR( eax, bl, 1) /* 5 */
726  MOV( ebx, bh, 0) /* 4 */
727  AS2( xor ebx, [L_KEY12+1*4])
728  XOR( eax, ch, 2) /* 10 */
729  AS2( shr ecx, 16) /* 8,9 */
730  XOR( eax, dl, 3) /* 15 */
731  XOR( ebx, dh, 2) /* 14 */
732  AS2( shr edx, 16) /* 12,13 */
733  XOR( edi, ch, 0) /* 8 */
734  XOR( ebx, cl, 1) /* 9 */
735  XOR( edi, dl, 1) /* 13 */
736  MXOR( 0, dh, 0) /* 12 */
737 
738  AS2( movd ecx, xmm2)
739  AS2( MOVD edx, MM(1))
740  AS2( MOVD [L_SAVED_X+3*4], MM(0))
741  AS2( mov [L_SAVED_X+0*4], eax)
742  AS2( mov [L_SAVED_X+1*4], ebx)
743  AS2( mov [L_SAVED_X+2*4], edi)
744  ASJ( jmp, 5, f)
745 
746  ASL(3)
747  // non-counter mode per-block setup
748  AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3
749  AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7
750  AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11
751  AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15
752  ASL(8)
753  AS2( mov WORD_REG(ax), [L_INBLOCKS])
754  AS2( movdqu xmm2, [WORD_REG(ax)])
755  AS2( mov WORD_REG(si), [L_INXORBLOCKS])
756  AS2( movdqu xmm5, [WORD_REG(si)])
757  AS2( pxor xmm2, xmm1)
758  AS2( pxor xmm2, xmm5)
759 
760  // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx
761  AS2( movd eax, xmm2)
762  AS2( psrldq xmm2, 4)
763  AS2( movd edi, xmm2)
764  AS2( psrldq xmm2, 4)
765  MXOR( 1, al, 0) // 0
766  XOR( edx, ah, 1) // 1
767  AS2( shr eax, 16)
768  XOR( ecx, al, 2) // 2
769  XOR( ebx, ah, 3) // 3
770  AS2( mov eax, edi)
771  AS2( movd edi, xmm2)
772  AS2( psrldq xmm2, 4)
773  XOR( ebx, al, 0) // 4
774  MXOR( 1, ah, 1) // 5
775  AS2( shr eax, 16)
776  XOR( edx, al, 2) // 6
777  XOR( ecx, ah, 3) // 7
778  AS2( mov eax, edi)
779  AS2( movd edi, xmm2)
780  XOR( ecx, al, 0) // 8
781  XOR( ebx, ah, 1) // 9
782  AS2( shr eax, 16)
783  MXOR( 1, al, 2) // 10
784  XOR( edx, ah, 3) // 11
785  AS2( mov eax, edi)
786  XOR( edx, al, 0) // 12
787  XOR( ecx, ah, 1) // 13
788  AS2( shr eax, 16)
789  XOR( ebx, al, 2) // 14
790  MXOR( 1, ah, 3) // 15
791  AS2( MOVD eax, MM(1))
792 
793  AS2( add L_REG, [L_KEYS_BEGIN])
794  AS2( add L_REG, 4*16)
795  ASJ( jmp, 2, f)
796 
797  ASL(1)
798  // counter-mode per-block setup
799  AS2( MOVD ecx, MM(2))
800  AS2( MOVD edx, MM(1))
801  AS2( mov eax, [L_SAVED_X+0*4])
802  AS2( mov ebx, [L_SAVED_X+1*4])
803  AS2( xor cl, ch)
804  AS2( and WORD_REG(cx), 255)
805  ASL(5)
806 #if CRYPTOPP_BOOL_X86
807  AS2( paddb MM(2), mm3)
808 #else
809  AS2( add MM(2), 1)
810 #endif
811  // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx
812  AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])
813  XOR( ebx, dl, 3)
814  MOV( ecx, dh, 2)
815  AS2( shr edx, 16)
816  AS2( xor ecx, [L_SAVED_X+2*4])
817  XOR( eax, dh, 0)
818  MOV( edx, dl, 1)
819  AS2( xor edx, [L_SAVED_X+3*4])
820 
821  AS2( add L_REG, [L_KEYS_BEGIN])
822  AS2( add L_REG, 3*16)
823  ASJ( jmp, 4, f)
824 
825 // in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15)
826 // out: eax, ebx, edi, mm0
827 #define ROUND() \
828  MXOR( 0, cl, 3) /* 11 */\
829  AS2( mov cl, al) /* 8,9,10,3 */\
830  XOR( edi, ah, 2) /* 2 */\
831  AS2( shr eax, 16) /* 0,1 */\
832  XOR( edi, bl, 3) /* 7 */\
833  MXOR( 0, bh, 2) /* 6 */\
834  AS2( shr ebx, 16) /* 4,5 */\
835  MXOR( 0, al, 1) /* 1 */\
836  MOV( eax, ah, 0) /* 0 */\
837  XOR( eax, bl, 1) /* 5 */\
838  MOV( ebx, bh, 0) /* 4 */\
839  XOR( eax, ch, 2) /* 10 */\
840  XOR( ebx, cl, 3) /* 3 */\
841  AS2( shr ecx, 16) /* 8,9 */\
842  XOR( eax, dl, 3) /* 15 */\
843  XOR( ebx, dh, 2) /* 14 */\
844  AS2( shr edx, 16) /* 12,13 */\
845  XOR( edi, ch, 0) /* 8 */\
846  XOR( ebx, cl, 1) /* 9 */\
847  XOR( edi, dl, 1) /* 13 */\
848  MXOR( 0, dh, 0) /* 12 */\
849 
850  ASL(2) // 2-round loop
851  AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
852  AS2( mov edi, [L_SUBKEYS-4*16+2*4])
853  ROUND()
854  AS2( mov ecx, edi)
855  AS2( xor eax, [L_SUBKEYS-4*16+0*4])
856  AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
857  AS2( MOVD edx, MM(0))
858 
859  ASL(4)
860  AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
861  AS2( mov edi, [L_SUBKEYS-4*16+6*4])
862  ROUND()
863  AS2( mov ecx, edi)
864  AS2( xor eax, [L_SUBKEYS-4*16+4*4])
865  AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
866  AS2( MOVD edx, MM(0))
867 
868  AS2( add L_REG, 32)
869  AS2( test L_REG, 255)
870  ASJ( jnz, 2, b)
871  AS2( sub L_REG, 16*16)
872 
873 #define LAST(a, b, c) \
874  AS2( movzx esi, a )\
875  AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
876  AS2( movzx esi, b )\
877  AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
878  AS2( mov WORD PTR [L_LASTROUND+c], di )\
879 
880  // last round
881  LAST(ch, dl, 2)
882  LAST(dh, al, 6)
883  AS2( shr edx, 16)
884  LAST(ah, bl, 10)
885  AS2( shr eax, 16)
886  LAST(bh, cl, 14)
887  AS2( shr ebx, 16)
888  LAST(dh, al, 12)
889  AS2( shr ecx, 16)
890  LAST(ah, bl, 0)
891  LAST(bh, cl, 4)
892  LAST(ch, dl, 8)
893 
894  AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
895  AS2( mov WORD_REG(bx), [L_OUTBLOCKS])
896 
897  AS2( mov WORD_REG(cx), [L_LENGTH])
898  AS2( sub WORD_REG(cx), 16)
899 
900  AS2( movdqu xmm2, [WORD_REG(ax)])
901  AS2( pxor xmm2, xmm4)
902 
903 #if CRYPTOPP_BOOL_X86
904  AS2( movdqa xmm0, [L_INCREMENTS])
905  AS2( paddd xmm0, [L_INBLOCKS])
906  AS2( movdqa [L_INBLOCKS], xmm0)
907 #else
908  AS2( movdqa xmm0, [L_INCREMENTS+16])
909  AS2( paddq xmm0, [L_INBLOCKS+16])
910  AS2( movdqa [L_INBLOCKS+16], xmm0)
911 #endif
912 
913  AS2( pxor xmm2, [L_LASTROUND])
914  AS2( movdqu [WORD_REG(bx)], xmm2)
915 
916  ASJ( jle, 7, f)
917  AS2( mov [L_LENGTH], WORD_REG(cx))
918  AS2( test WORD_REG(cx), 1)
919  ASJ( jnz, 1, b)
920 #if CRYPTOPP_BOOL_X64
921  AS2( movdqa xmm0, [L_INCREMENTS])
922  AS2( paddq xmm0, [L_INBLOCKS])
923  AS2( movdqa [L_INBLOCKS], xmm0)
924 #endif
925  ASJ( jmp, 3, b)
926 
927  ASL(7)
928  // erase keys on stack
929  AS2( xorps xmm0, xmm0)
930  AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
931  AS2( movaps [WORD_REG(ax)-7*16], xmm0)
932  AS2( movaps [WORD_REG(ax)-6*16], xmm0)
933  AS2( movaps [WORD_REG(ax)-5*16], xmm0)
934  AS2( movaps [WORD_REG(ax)-4*16], xmm0)
935  AS2( movaps [WORD_REG(ax)-3*16], xmm0)
936  AS2( movaps [WORD_REG(ax)-2*16], xmm0)
937  AS2( movaps [WORD_REG(ax)-1*16], xmm0)
938  AS2( movaps [WORD_REG(ax)+0*16], xmm0)
939  AS2( movaps [WORD_REG(ax)+1*16], xmm0)
940  AS2( movaps [WORD_REG(ax)+2*16], xmm0)
941  AS2( movaps [WORD_REG(ax)+3*16], xmm0)
942  AS2( movaps [WORD_REG(ax)+4*16], xmm0)
943  AS2( movaps [WORD_REG(ax)+5*16], xmm0)
944  AS2( movaps [WORD_REG(ax)+6*16], xmm0)
945 #if CRYPTOPP_BOOL_X86
946  AS2( mov esp, [L_SP])
947  AS1( emms)
948 #endif
949  AS_POP_IF86(bp)
950  AS_POP_IF86(bx)
951 #if defined(_MSC_VER) && CRYPTOPP_BOOL_X86
952  AS_POP_IF86(di)
953  AS_POP_IF86(si)
954  AS1(ret)
955 #endif
956 #ifdef CRYPTOPP_GENERATE_X64_MASM
957  pop r12
958  pop rbx
959  pop rdi
960  pop rsi
961  ret
962  Rijndael_Enc_AdvancedProcessBlocks ENDP
963 #endif
964 #ifdef __GNUC__
965  ".att_syntax prefix;"
966  :
967  : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
968  : "memory", "cc", "%eax"
969  #if CRYPTOPP_BOOL_X64
970  , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
971  #endif
972  );
973 #endif
974 }
975 
976 #endif
977 
978 #ifndef CRYPTOPP_GENERATE_X64_MASM
979 
980 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
981 extern "C" {
982 void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);
983 }
984 #endif
985 
986 #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86
987 
988 static inline bool AliasedWithTable(const byte *begin, const byte *end)
989 {
990  size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
991  size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
992  if (t1 > t0)
993  return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
994  else
995  return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
996 }
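// The SSE2 path below copies the round keys and per-call locals into a stack
// workspace obtained with alloca(); AliasedWithTable() reports whether that
// workspace overlaps the Te table modulo 4096, in which case the caller
// retries with a new allocation. This presumably keeps the workspace from
// evicting the preloaded table cache lines (see the timing-attack notes at
// the top of the file).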
997 
998 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
999 
1000 inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1001 {
1002  block = _mm_xor_si128(block, subkeys[0]);
1003  for (unsigned int i=1; i<rounds-1; i+=2)
1004  {
1005  block = _mm_aesenc_si128(block, subkeys[i]);
1006  block = _mm_aesenc_si128(block, subkeys[i+1]);
1007  }
1008  block = _mm_aesenc_si128(block, subkeys[rounds-1]);
1009  block = _mm_aesenclast_si128(block, subkeys[rounds]);
1010 }
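// A minimal usage sketch of the helper above (buffer names are illustrative;
// the actual call sites are inside AESNI_AdvancedProcessBlocks below):
//
//     __m128i b = _mm_loadu_si128((const __m128i *)inBlock);
//     AESNI_Enc_Block(b, (const __m128i *)roundKeys, rounds);
//     _mm_storeu_si128((__m128i *)outBlock, b);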
1011 
1012 inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1013 {
1014  __m128i rk = subkeys[0];
1015  block0 = _mm_xor_si128(block0, rk);
1016  block1 = _mm_xor_si128(block1, rk);
1017  block2 = _mm_xor_si128(block2, rk);
1018  block3 = _mm_xor_si128(block3, rk);
1019  for (unsigned int i=1; i<rounds; i++)
1020  {
1021  rk = subkeys[i];
1022  block0 = _mm_aesenc_si128(block0, rk);
1023  block1 = _mm_aesenc_si128(block1, rk);
1024  block2 = _mm_aesenc_si128(block2, rk);
1025  block3 = _mm_aesenc_si128(block3, rk);
1026  }
1027  rk = subkeys[rounds];
1028  block0 = _mm_aesenclast_si128(block0, rk);
1029  block1 = _mm_aesenclast_si128(block1, rk);
1030  block2 = _mm_aesenclast_si128(block2, rk);
1031  block3 = _mm_aesenclast_si128(block3, rk);
1032 }
1033 
1034 inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
1035 {
1036  block = _mm_xor_si128(block, subkeys[0]);
1037  for (unsigned int i=1; i<rounds-1; i+=2)
1038  {
1039  block = _mm_aesdec_si128(block, subkeys[i]);
1040  block = _mm_aesdec_si128(block, subkeys[i+1]);
1041  }
1042  block = _mm_aesdec_si128(block, subkeys[rounds-1]);
1043  block = _mm_aesdeclast_si128(block, subkeys[rounds]);
1044 }
1045 
1046 inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
1047 {
1048  __m128i rk = subkeys[0];
1049  block0 = _mm_xor_si128(block0, rk);
1050  block1 = _mm_xor_si128(block1, rk);
1051  block2 = _mm_xor_si128(block2, rk);
1052  block3 = _mm_xor_si128(block3, rk);
1053  for (unsigned int i=1; i<rounds; i++)
1054  {
1055  rk = subkeys[i];
1056  block0 = _mm_aesdec_si128(block0, rk);
1057  block1 = _mm_aesdec_si128(block1, rk);
1058  block2 = _mm_aesdec_si128(block2, rk);
1059  block3 = _mm_aesdec_si128(block3, rk);
1060  }
1061  rk = subkeys[rounds];
1062  block0 = _mm_aesdeclast_si128(block0, rk);
1063  block1 = _mm_aesdeclast_si128(block1, rk);
1064  block2 = _mm_aesdeclast_si128(block2, rk);
1065  block3 = _mm_aesdeclast_si128(block3, rk);
1066 }
1067 
1068 static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
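// s_one is the per-block counter increment for CTR mode: loaded as an __m128i
// and added with _mm_add_epi32, the 1<<24 in the last 32-bit lane adds 1 to
// byte 15 of the block, the low byte of the big-endian counter. Carry into the
// higher counter bytes is expected to be handled by the caller, matching the
// single-block path that simply does inBlocks[15]++.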
1069 
1070 template <typename F1, typename F4>
1071 inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1072 {
1073  size_t blockSize = 16;
1074  size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1075  size_t xorIncrement = xorBlocks ? blockSize : 0;
1076  size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
1077 
1078  if (flags & BlockTransformation::BT_ReverseDirection)
1079  {
1080  assert(length % blockSize == 0);
1081  inBlocks += length - blockSize;
1082  xorBlocks += length - blockSize;
1083  outBlocks += length - blockSize;
1084  inIncrement = 0-inIncrement;
1085  xorIncrement = 0-xorIncrement;
1086  outIncrement = 0-outIncrement;
1087  }
1088 
1089  if (flags & BlockTransformation::BT_AllowParallel)
1090  {
1091  while (length >= 4*blockSize)
1092  {
1093  __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
1094  if (flags & BlockTransformation::BT_InBlockIsCounter)
1095  {
1096  const __m128i be1 = *(const __m128i *)s_one;
1097  block1 = _mm_add_epi32(block0, be1);
1098  block2 = _mm_add_epi32(block1, be1);
1099  block3 = _mm_add_epi32(block2, be1);
1100  _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));
1101  }
1102  else
1103  {
1104  inBlocks += inIncrement;
1105  block1 = _mm_loadu_si128((const __m128i *)inBlocks);
1106  inBlocks += inIncrement;
1107  block2 = _mm_loadu_si128((const __m128i *)inBlocks);
1108  inBlocks += inIncrement;
1109  block3 = _mm_loadu_si128((const __m128i *)inBlocks);
1110  inBlocks += inIncrement;
1111  }
1112 
1113  if (flags & BlockTransformation::BT_XorInput)
1114  {
1115  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
1116  xorBlocks += xorIncrement;
1117  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
1118  xorBlocks += xorIncrement;
1119  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
1120  xorBlocks += xorIncrement;
1121  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
1122  xorBlocks += xorIncrement;
1123  }
1124 
1125  func4(block0, block1, block2, block3, subkeys, rounds);
1126 
1127  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1128  {
1129  block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
1130  xorBlocks += xorIncrement;
1131  block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
1132  xorBlocks += xorIncrement;
1133  block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
1134  xorBlocks += xorIncrement;
1135  block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
1136  xorBlocks += xorIncrement;
1137  }
1138 
1139  _mm_storeu_si128((__m128i *)outBlocks, block0);
1140  outBlocks += outIncrement;
1141  _mm_storeu_si128((__m128i *)outBlocks, block1);
1142  outBlocks += outIncrement;
1143  _mm_storeu_si128((__m128i *)outBlocks, block2);
1144  outBlocks += outIncrement;
1145  _mm_storeu_si128((__m128i *)outBlocks, block3);
1146  outBlocks += outIncrement;
1147 
1148  length -= 4*blockSize;
1149  }
1150  }
1151 
1152  while (length >= blockSize)
1153  {
1154  __m128i block = _mm_loadu_si128((const __m128i *)inBlocks);
1155 
1156  if (flags & BlockTransformation::BT_XorInput)
1157  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
1158 
1159  if (flags & BlockTransformation::BT_InBlockIsCounter)
1160  const_cast<byte *>(inBlocks)[15]++;
1161 
1162  func1(block, subkeys, rounds);
1163 
1164  if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
1165  block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));
1166 
1167  _mm_storeu_si128((__m128i *)outBlocks, block);
1168 
1169  inBlocks += inIncrement;
1170  outBlocks += outIncrement;
1171  xorBlocks += xorIncrement;
1172  length -= blockSize;
1173  }
1174 
1175  return length;
1176 }
1177 #endif
1178 
1179 size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1180 {
1181 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1182  if (HasAESNI())
1183  return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1184 #endif
1185 
1186 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
1187  if (HasSSE2())
1188  {
1189  if (length < BLOCKSIZE)
1190  return length;
1191 
1192  struct Locals
1193  {
1194  word32 subkeys[4*12], workspace[8];
1195  const byte *inBlocks, *inXorBlocks, *outXorBlocks;
1196  byte *outBlocks;
1197  size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
1198  size_t regSpill, lengthAndCounterFlag, keysBegin;
1199  };
1200 
1201  size_t increment = BLOCKSIZE;
1202  const byte* zeros = (byte *)(Te+256);
1203  byte *space;
1204 
1205  do {
1206  space = (byte *)alloca(255+sizeof(Locals));
1207  space += (256-(size_t)space%256)%256;
1208  }
1209  while (AliasedWithTable(space, space+sizeof(Locals)));
1210 
1211  if (flags & BT_ReverseDirection)
1212  {
1213  assert(length % BLOCKSIZE == 0);
1214  inBlocks += length - BLOCKSIZE;
1215  xorBlocks += length - BLOCKSIZE;
1216  outBlocks += length - BLOCKSIZE;
1217  increment = 0-increment;
1218  }
1219 
1220  Locals &locals = *(Locals *)space;
1221 
1222  locals.inBlocks = inBlocks;
1223  locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
1224  locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
1225  locals.outBlocks = outBlocks;
1226 
1227  locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1228  locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
1229  locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
1230  locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
1231 
1232  locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
1233  int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
1234  locals.keysBegin = (12-keysToCopy)*16;
1235 
1236  Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
1237  return length % BLOCKSIZE;
1238  }
1239 #endif
1240 
1241  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1242 }
1243 
1244 #endif
1245 
1246 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1247 
1248 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
1249 {
1250  if (HasAESNI())
1251  return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1252 
1253  return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
1254 }
1255 
1256 #endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
1257 
1258 NAMESPACE_END
1259 
1260 #endif
1261 #endif