// vmac.cpp - written and placed in the public domain by Wei Dai
// based on Ted Krovetz's public domain vmac.c and draft-krovetz-vmac-01.txt

#include "pch.h"
#include "vmac.h"
#include "argnames.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

#if defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
#include <intrin.h>
#endif

#define VMAC_BOOL_WORD128 (defined(CRYPTOPP_WORD128_AVAILABLE) && !defined(CRYPTOPP_X64_ASM_AVAILABLE))
#ifdef __BORLANDC__
#define const	// Turbo C++ 2006 workaround
#endif
static const word64 p64   = W64LIT(0xfffffffffffffeff); /* 2^64 - 257 prime */
static const word64 m62   = W64LIT(0x3fffffffffffffff); /* 62-bit mask */
static const word64 m63   = W64LIT(0x7fffffffffffffff); /* 63-bit mask */
static const word64 m64   = W64LIT(0xffffffffffffffff); /* 64-bit mask */
static const word64 mpoly = W64LIT(0x1fffffff1fffffff); /* Poly key mask */
#ifdef __BORLANDC__
#undef const
#endif
#if VMAC_BOOL_WORD128
#ifdef __powerpc__
// workaround GCC Bug 31690: ICE with const __uint128_t and C++ front-end
#define m126 ((word128(m62)<<64)|m64)
#else
static const word128 m126 = (word128(m62)<<64)|m64; /* 126-bit mask */
#endif
#endif
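
// Background on the constants above (see draft-krovetz-vmac-01 for details):
// VHASH is a three-level hash. The first-level NH hash compresses the message
// in m_L1KeyLength-byte chunks, keeping each accumulated sum mod 2^126 (the
// m62 and m126 masks). A second-level polynomial hash folds the chunk results
// together mod the prime 2^127 - 1 (m63 masks the high word during that
// reduction). L3Hash below then maps the 127-bit result into 64 bits using
// the prime p64 = 2^64 - 257. mpoly clears the top three bits of each 32-bit
// half of the polynomial key, keeping the key small enough that the
// multiply-accumulate steps in the poly_step macros cannot overflow.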

void VMAC_Base::UncheckedSetKey(const byte *userKey, unsigned int keylength, const NameValuePairs &params)
{
	int digestLength = params.GetIntValueWithDefault(Name::DigestSize(), DefaultDigestSize());
	if (digestLength != 8 && digestLength != 16)
		throw InvalidArgument("VMAC: DigestSize must be 8 or 16");
	m_is128 = digestLength == 16;

	m_L1KeyLength = params.GetIntValueWithDefault(Name::L1KeyLength(), 128);
	if (m_L1KeyLength <= 0 || m_L1KeyLength % 128 != 0)
		throw InvalidArgument("VMAC: L1KeyLength must be a positive multiple of 128");

	AllocateBlocks();

	BlockCipher &cipher = AccessCipher();
	cipher.SetKey(userKey, keylength, params);
	unsigned int blockSize = cipher.BlockSize();
	unsigned int blockSizeInWords = blockSize / sizeof(word64);
	SecBlock<word64> out(blockSizeInWords);
	SecByteBlock in;
	in.CleanNew(blockSize);
	size_t i;

	/* Fill nh key */
	in[0] = 0x80;
	cipher.AdvancedProcessBlocks(in, NULL, (byte *)m_nhKey(), m_nhKeySize()*sizeof(word64), cipher.BT_InBlockIsCounter);
	ConditionalByteReverse<word64>(BIG_ENDIAN_ORDER, m_nhKey(), m_nhKey(), m_nhKeySize()*sizeof(word64));

	/* Fill poly key */
	in[0] = 0xC0;
	in[15] = 0;
	for (i = 0; i <= (size_t)m_is128; i++)
	{
		cipher.ProcessBlock(in, out.BytePtr());
		m_polyState()[i*4+2] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()) & mpoly;
		m_polyState()[i*4+3] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8) & mpoly;
		in[15]++;
	}

	/* Fill ip key */
	in[0] = 0xE0;
	in[15] = 0;
	word64 *l3Key = m_l3Key();
	for (i = 0; i <= (size_t)m_is128; i++)
		do
		{
			cipher.ProcessBlock(in, out.BytePtr());
			l3Key[i*2+0] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr());
			l3Key[i*2+1] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8);
			in[15]++;
		} while ((l3Key[i*2+0] >= p64) || (l3Key[i*2+1] >= p64));

	m_padCached = false;
	size_t nonceLength;
	const byte *nonce = GetIVAndThrowIfInvalid(params, nonceLength);
	Resynchronize(nonce, (int)nonceLength);
}
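
// Illustrative usage sketch (not part of this translation unit): keying
// VMAC<AES> from vmac.h through the standard MessageAuthenticationCode
// interface. SetKeyWithIV ends up in UncheckedSetKey above; the nonce's top
// bit is kept clear, matching GetNextIV below. messagePtr/messageLen stand in
// for the caller's buffer.
//
//   #include "vmac.h"
//   #include "aes.h"
//   #include "osrng.h"
//
//   CryptoPP::AutoSeededRandomPool prng;
//   CryptoPP::SecByteBlock key(CryptoPP::AES::DEFAULT_KEYLENGTH);
//   CryptoPP::SecByteBlock iv(CryptoPP::AES::BLOCKSIZE);
//   prng.GenerateBlock(key, key.size());
//   prng.GenerateBlock(iv, iv.size());
//   iv[0] &= 0x7f;
//
//   CryptoPP::VMAC<CryptoPP::AES> vmac;  // 128-bit tag by default
//   vmac.SetKeyWithIV(key, key.size(), iv, iv.size());
//   byte tag[16];
//   vmac.CalculateDigest(tag, messagePtr, messageLen);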

void VMAC_Base::GetNextIV(RandomNumberGenerator &rng, byte *IV)
{
	SimpleKeyingInterface::GetNextIV(rng, IV);
	IV[0] &= 0x7f;
}

void VMAC_Base::Resynchronize(const byte *nonce, int len)
{
	size_t length = ThrowIfInvalidIVLength(len);
	size_t s = IVSize();
	byte *storedNonce = m_nonce();

	if (m_is128)
	{
		memset(storedNonce, 0, s-length);
		memcpy(storedNonce+s-length, nonce, length);
		AccessCipher().ProcessBlock(storedNonce, m_pad());
	}
	else
	{
		// For 64-bit tags, only the last bit of the nonce selects which half
		// of the pad is used (see TruncatedFinal), so the cipher call can be
		// skipped when the new nonce differs from the cached one only in
		// that bit.
		if (m_padCached && (storedNonce[s-1] | 1) == (nonce[length-1] | 1))
		{
			m_padCached = VerifyBufsEqual(storedNonce+s-length, nonce, length-1);
			for (size_t i=0; m_padCached && i<s-length; i++)
				m_padCached = (storedNonce[i] == 0);
		}
		if (!m_padCached)
		{
			memset(storedNonce, 0, s-length);
			memcpy(storedNonce+s-length, nonce, length-1);
			storedNonce[s-1] = nonce[length-1] & 0xfe;
			AccessCipher().ProcessBlock(storedNonce, m_pad());
			m_padCached = true;
		}
		storedNonce[s-1] = nonce[length-1];
	}
	m_isFirstBlock = true;
	Restart();
}

void VMAC_Base::HashEndianCorrectedBlock(const word64 *data)
{
	assert(false);
	throw 0;
}

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
#pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code
void
#ifdef __GNUC__
__attribute__ ((noinline))	// Intel Compiler 9.1 workaround
#endif
VMAC_Base::VHASH_Update_SSE2(const word64 *data, size_t blocksRemainingInWord64, int tagPart)
{
	const word64 *nhK = m_nhKey();
	word64 *polyS = m_polyState();
	word32 L1KeyLength = m_L1KeyLength;

#ifdef __GNUC__
	word32 temp;
	__asm__ __volatile__
	(
	AS2( mov %%ebx, %0)
	AS2( mov %1, %%ebx)
	".intel_syntax noprefix;"
#else
	#if _MSC_VER < 1300 || defined(__INTEL_COMPILER)
	char isFirstBlock = m_isFirstBlock;
	AS2( mov ebx, [L1KeyLength])
	AS2( mov dl, [isFirstBlock])
	#else
	AS2( mov ecx, this)
	AS2( mov ebx, [ecx+m_L1KeyLength])
	AS2( mov dl, [ecx+m_isFirstBlock])
	#endif
	AS2( mov eax, tagPart)
	AS2( shl eax, 4)
	AS2( mov edi, nhK)
	AS2( add edi, eax)
	AS2( add eax, eax)
	AS2( add eax, polyS)

	AS2( mov esi, data)
	AS2( mov ecx, blocksRemainingInWord64)
#endif

	AS2( shr ebx, 3)
	AS1( push ebp)
	AS2( sub esp, 12)
	ASL(4)
	AS2( mov ebp, ebx)
	AS2( cmp ecx, ebx)
	AS2( cmovl ebp, ecx)
	AS2( sub ecx, ebp)
	AS2( lea ebp, [edi+8*ebp]) // end of nhK
	AS2( movq mm6, [esi])
	AS2( paddq mm6, [edi])
	AS2( movq mm5, [esi+8])
	AS2( paddq mm5, [edi+8])
	AS2( add esi, 16)
	AS2( add edi, 16)
	AS2( movq mm4, mm6)
	ASS( pshufw mm2, mm6, 1, 0, 3, 2)
	AS2( pmuludq mm6, mm5)
	ASS( pshufw mm3, mm5, 1, 0, 3, 2)
	AS2( pmuludq mm5, mm2)
	AS2( pmuludq mm2, mm3)
	AS2( pmuludq mm3, mm4)
	AS2( pxor mm7, mm7)
	AS2( movd [esp], mm6)
	AS2( psrlq mm6, 32)
	AS2( movd [esp+4], mm5)
	AS2( psrlq mm5, 32)
	AS2( cmp edi, ebp)
	ASJ( je, 1, f)
	ASL(0)
	AS2( movq mm0, [esi])
	AS2( paddq mm0, [edi])
	AS2( movq mm1, [esi+8])
	AS2( paddq mm1, [edi+8])
	AS2( add esi, 16)
	AS2( add edi, 16)
	AS2( movq mm4, mm0)
	AS2( paddq mm5, mm2)
	ASS( pshufw mm2, mm0, 1, 0, 3, 2)
	AS2( pmuludq mm0, mm1)
	AS2( movd [esp+8], mm3)
	AS2( psrlq mm3, 32)
	AS2( paddq mm5, mm3)
	ASS( pshufw mm3, mm1, 1, 0, 3, 2)
	AS2( pmuludq mm1, mm2)
	AS2( pmuludq mm2, mm3)
	AS2( pmuludq mm3, mm4)
	AS2( movd mm4, [esp])
	AS2( paddq mm7, mm4)
	AS2( movd mm4, [esp+4])
	AS2( paddq mm6, mm4)
	AS2( movd mm4, [esp+8])
	AS2( paddq mm6, mm4)
	AS2( movd [esp], mm0)
	AS2( psrlq mm0, 32)
	AS2( paddq mm6, mm0)
	AS2( movd [esp+4], mm1)
	AS2( psrlq mm1, 32)
	AS2( paddq mm5, mm1)
	AS2( cmp edi, ebp)
	ASJ( jne, 0, b)
	ASL(1)
	AS2( paddq mm5, mm2)
	AS2( movd [esp+8], mm3)
	AS2( psrlq mm3, 32)
	AS2( paddq mm5, mm3)
	AS2( movd mm4, [esp])
	AS2( paddq mm7, mm4)
	AS2( movd mm4, [esp+4])
	AS2( paddq mm6, mm4)
	AS2( movd mm4, [esp+8])
	AS2( paddq mm6, mm4)
	AS2( lea ebp, [8*ebx])
	AS2( sub edi, ebp) // reset edi to start of nhK

	AS2( movd [esp], mm7)
	AS2( psrlq mm7, 32)
	AS2( paddq mm6, mm7)
	AS2( movd [esp+4], mm6)
	AS2( psrlq mm6, 32)
	AS2( paddq mm5, mm6)
	AS2( psllq mm5, 2)
	AS2( psrlq mm5, 2)

#define a0 [eax+2*4]
#define a1 [eax+3*4]
#define a2 [eax+0*4]
#define a3 [eax+1*4]
#define k0 [eax+2*8+2*4]
#define k1 [eax+2*8+3*4]
#define k2 [eax+2*8+0*4]
#define k3 [eax+2*8+1*4]
	AS2( test dl, dl)
	ASJ( jz, 2, f)
	AS2( movd mm1, k0)
	AS2( movd mm0, [esp])
	AS2( paddq mm0, mm1)
	AS2( movd a0, mm0)
	AS2( psrlq mm0, 32)
	AS2( movd mm1, k1)
	AS2( movd mm2, [esp+4])
	AS2( paddq mm1, mm2)
	AS2( paddq mm0, mm1)
	AS2( movd a1, mm0)
	AS2( psrlq mm0, 32)
	AS2( paddq mm5, k2)
	AS2( paddq mm0, mm5)
	AS2( movq a2, mm0)
	AS2( xor edx, edx)
	ASJ( jmp, 3, f)
	ASL(2)
	AS2( movd mm0, a3)
	AS2( movq mm4, mm0)
	AS2( pmuludq mm0, k3) // a3*k3
	AS2( movd mm1, a0)
	AS2( pmuludq mm1, k2) // a0*k2
	AS2( movd mm2, a1)
	AS2( movd mm6, k1)
	AS2( pmuludq mm2, mm6) // a1*k1
	AS2( movd mm3, a2)
	AS2( psllq mm0, 1)
	AS2( paddq mm0, mm5)
	AS2( movq mm5, mm3)
	AS2( movd mm7, k0)
	AS2( pmuludq mm3, mm7) // a2*k0
	AS2( pmuludq mm4, mm7) // a3*k0
	AS2( pmuludq mm5, mm6) // a2*k1
	AS2( paddq mm0, mm1)
	AS2( movd mm1, a1)
	AS2( paddq mm4, mm5)
	AS2( movq mm5, mm1)
	AS2( pmuludq mm1, k2) // a1*k2
	AS2( paddq mm0, mm2)
	AS2( movd mm2, a0)
	AS2( paddq mm0, mm3)
	AS2( movq mm3, mm2)
	AS2( pmuludq mm2, k3) // a0*k3
	AS2( pmuludq mm3, mm7) // a0*k0
	AS2( movd [esp+8], mm0)
	AS2( psrlq mm0, 32)
	AS2( pmuludq mm7, mm5) // a1*k0
	AS2( pmuludq mm5, k3) // a1*k3
	AS2( paddq mm0, mm1)
	AS2( movd mm1, a2)
	AS2( pmuludq mm1, k2) // a2*k2
	AS2( paddq mm0, mm2)
	AS2( paddq mm0, mm4)
	AS2( movq mm4, mm0)
	AS2( movd mm2, a3)
	AS2( pmuludq mm2, mm6) // a3*k1
	AS2( pmuludq mm6, a0) // a0*k1
	AS2( psrlq mm0, 31)
	AS2( paddq mm0, mm3)
	AS2( movd mm3, [esp])
	AS2( paddq mm0, mm3)
	AS2( movd mm3, a2)
	AS2( pmuludq mm3, k3) // a2*k3
	AS2( paddq mm5, mm1)
	AS2( movd mm1, a3)
	AS2( pmuludq mm1, k2) // a3*k2
	AS2( paddq mm5, mm2)
	AS2( movd mm2, [esp+4])
	AS2( psllq mm5, 1)
	AS2( paddq mm0, mm5)
	AS2( psllq mm4, 33)
	AS2( movd a0, mm0)
	AS2( psrlq mm0, 32)
	AS2( paddq mm6, mm7)
	AS2( movd mm7, [esp+8])
	AS2( paddq mm0, mm6)
	AS2( paddq mm0, mm2)
	AS2( paddq mm3, mm1)
	AS2( psllq mm3, 1)
	AS2( paddq mm0, mm3)
	AS2( psrlq mm4, 1)
	AS2( movd a1, mm0)
	AS2( psrlq mm0, 32)
	AS2( por mm4, mm7)
	AS2( paddq mm0, mm4)
	AS2( movq a2, mm0)
#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3

	ASL(3)
	AS2( test ecx, ecx)
	ASJ( jnz, 4, b)

	AS2( add esp, 12)
	AS1( pop ebp)
	AS1( emms)
#ifdef __GNUC__
	".att_syntax prefix;"
	AS2( mov %0, %%ebx)
	: "=m" (temp)
	: "m" (L1KeyLength), "c" (blocksRemainingInWord64), "S" (data), "D" (nhK+tagPart*2), "d" (m_isFirstBlock), "a" (polyS+tagPart*4)
	: "memory", "cc"
	);
#endif
}
#endif

#if VMAC_BOOL_WORD128
	#define DeclareNH(a) word128 a=0
	#define MUL64(rh,rl,i1,i2) {word128 p = word128(i1)*(i2); rh = word64(p>>64); rl = word64(p);}
	#define AccumulateNH(a, b, c) a += word128(b)*(c)
	#define Multiply128(r, i1, i2) r = word128(word64(i1)) * word64(i2)
#else
	#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
		#define MUL32(a, b) __emulu(word32(a), word32(b))
	#else
		#define MUL32(a, b) ((word64)((word32)(a)) * (word32)(b))
	#endif
	#if defined(CRYPTOPP_X64_ASM_AVAILABLE)
		#define DeclareNH(a) word64 a##0=0, a##1=0
		#define MUL64(rh,rl,i1,i2) asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "g"(i2) : "cc");
		#define AccumulateNH(a, b, c) asm ("mulq %3; addq %%rax, %0; adcq %%rdx, %1" : "+r"(a##0), "+r"(a##1) : "a"(b), "g"(c) : "%rdx", "cc");
		#define ADD128(rh,rl,ih,il) asm ("addq %3, %1; adcq %2, %0" : "+r"(rh),"+r"(rl) : "r"(ih),"r"(il) : "cc");
	#elif defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
		#define DeclareNH(a) word64 a##0=0, a##1=0
		#define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh));
		#define AccumulateNH(a, b, c) {\
			word64 ph, pl;\
			pl = _umul128(b,c,&ph);\
			a##0 += pl;\
			a##1 += ph + (a##0 < pl);}
	#else
		#define VMAC_BOOL_32BIT 1
		#define DeclareNH(a) word64 a##0=0, a##1=0, a##2=0
		#define MUL64(rh,rl,i1,i2) \
			{ word64 _i1 = (i1), _i2 = (i2); \
			  word64 m1 = MUL32(_i1,_i2>>32); \
			  word64 m2 = MUL32(_i1>>32,_i2); \
			  rh = MUL32(_i1>>32,_i2>>32); \
			  rl = MUL32(_i1,_i2); \
			  ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \
			  ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \
			}
		#define AccumulateNH(a, b, c) {\
			word64 p = MUL32(b, c);\
			a##1 += word32((p)>>32);\
			a##0 += word32(p);\
			p = MUL32((b)>>32, c);\
			a##2 += word32((p)>>32);\
			a##1 += word32(p);\
			p = MUL32((b)>>32, (c)>>32);\
			a##2 += p;\
			p = MUL32(b, (c)>>32);\
			a##1 += word32(p);\
			a##2 += word32(p>>32);}
	#endif
#endif
#ifndef VMAC_BOOL_32BIT
	#define VMAC_BOOL_32BIT 0
#endif
#ifndef ADD128
	#define ADD128(rh,rl,ih,il) \
		{ word64 _il = (il); \
		  (rl) += (_il); \
		  (rh) += (ih) + ((rl) < (_il)); \
		}
#endif
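
// A sanity check for the portable ADD128 fallback above: after (rl) += (_il),
// the 64-bit addition wrapped exactly when the new rl is smaller than _il, so
// ((rl) < (_il)) is the carry into the high word. For example,
// rl = 0xffffffffffffffff and il = 2 leaves rl = 1, and since 1 < 2 a carry
// of 1 propagates into rh.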

#if !(defined(_MSC_VER) && _MSC_VER < 1300)
template <bool T_128BitTag>
#endif
void VMAC_Base::VHASH_Update_Template(const word64 *data, size_t blocksRemainingInWord64)
{
	#define INNER_LOOP_ITERATION(j) {\
		word64 d0 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+0]);\
		word64 d1 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+1]);\
		AccumulateNH(nhA, d0+nhK[i+2*j+0], d1+nhK[i+2*j+1]);\
		if (T_128BitTag)\
			AccumulateNH(nhB, d0+nhK[i+2*j+2], d1+nhK[i+2*j+3]);\
		}
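
	// Each iteration above adds one NH term (m[2j]+k[2j]) * (m[2j+1]+k[2j+1]):
	// the two additions are mod 2^64 and the product is accumulated in 128
	// bits. For a 128-bit tag, the same data is hashed a second time with the
	// NH key offset by two words, producing the independent lane nhB.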

#if (defined(_MSC_VER) && _MSC_VER < 1300)
	bool T_128BitTag = m_is128;
#endif
	size_t L1KeyLengthInWord64 = m_L1KeyLength / 8;
	size_t innerLoopEnd = L1KeyLengthInWord64;
	const word64 *nhK = m_nhKey();
	word64 *polyS = m_polyState();
	bool isFirstBlock = true;
	size_t i;

	#if !VMAC_BOOL_32BIT
		#if VMAC_BOOL_WORD128
			word128 a1, a2;
		#else
			word64 ah1, al1, ah2, al2;
		#endif
		word64 kh1, kl1, kh2, kl2;
		kh1=(polyS+0*4+2)[0]; kl1=(polyS+0*4+2)[1];
		if (T_128BitTag)
		{
			kh2=(polyS+1*4+2)[0]; kl2=(polyS+1*4+2)[1];
		}
	#endif

	do
	{
		DeclareNH(nhA);
		DeclareNH(nhB);

		i = 0;
		if (blocksRemainingInWord64 < L1KeyLengthInWord64)
		{
			if (blocksRemainingInWord64 % 8)
			{
				innerLoopEnd = blocksRemainingInWord64 % 8;
				for (; i<innerLoopEnd; i+=2)
					INNER_LOOP_ITERATION(0);
			}
			innerLoopEnd = blocksRemainingInWord64;
		}
		for (; i<innerLoopEnd; i+=8)
		{
			INNER_LOOP_ITERATION(0);
			INNER_LOOP_ITERATION(1);
			INNER_LOOP_ITERATION(2);
			INNER_LOOP_ITERATION(3);
		}
		blocksRemainingInWord64 -= innerLoopEnd;
		data += innerLoopEnd;

		#if VMAC_BOOL_32BIT
			word32 nh0[2], nh1[2];
			word64 nh2[2];

			nh0[0] = word32(nhA0);
			nhA1 += (nhA0 >> 32);
			nh1[0] = word32(nhA1);
			nh2[0] = (nhA2 + (nhA1 >> 32)) & m62;

			if (T_128BitTag)
			{
				nh0[1] = word32(nhB0);
				nhB1 += (nhB0 >> 32);
				nh1[1] = word32(nhB1);
				nh2[1] = (nhB2 + (nhB1 >> 32)) & m62;
			}

			#define a0 (((word32 *)(polyS+i*4))[2+NativeByteOrder::ToEnum()])
			#define a1 (*(((word32 *)(polyS+i*4))+3-NativeByteOrder::ToEnum())) // workaround for GCC 3.2
			#define a2 (((word32 *)(polyS+i*4))[0+NativeByteOrder::ToEnum()])
			#define a3 (*(((word32 *)(polyS+i*4))+1-NativeByteOrder::ToEnum()))
			#define aHi ((polyS+i*4)[0])
			#define k0 (((word32 *)(polyS+i*4+2))[2+NativeByteOrder::ToEnum()])
			#define k1 (*(((word32 *)(polyS+i*4+2))+3-NativeByteOrder::ToEnum()))
			#define k2 (((word32 *)(polyS+i*4+2))[0+NativeByteOrder::ToEnum()])
			#define k3 (*(((word32 *)(polyS+i*4+2))+1-NativeByteOrder::ToEnum()))
			#define kHi ((polyS+i*4+2)[0])

			if (isFirstBlock)
			{
				isFirstBlock = false;
				if (m_isFirstBlock)
				{
					m_isFirstBlock = false;
					for (i=0; i<=(size_t)T_128BitTag; i++)
					{
						word64 t = (word64)nh0[i] + k0;
						a0 = (word32)t;
						t = (t >> 32) + nh1[i] + k1;
						a1 = (word32)t;
						aHi = (t >> 32) + nh2[i] + kHi;
					}
					continue;
				}
			}
			for (i=0; i<=(size_t)T_128BitTag; i++)
			{
				word64 p, t;
				word32 t2;

				p = MUL32(a3, 2*k3);
				p += nh2[i];
				p += MUL32(a0, k2);
				p += MUL32(a1, k1);
				p += MUL32(a2, k0);
				t2 = (word32)p;
				p >>= 32;
				p += MUL32(a0, k3);
				p += MUL32(a1, k2);
				p += MUL32(a2, k1);
				p += MUL32(a3, k0);
				t = (word64(word32(p) & 0x7fffffff) << 32) | t2;
				p >>= 31;
				p += nh0[i];
				p += MUL32(a0, k0);
				p += MUL32(a1, 2*k3);
				p += MUL32(a2, 2*k2);
				p += MUL32(a3, 2*k1);
				t2 = (word32)p;
				p >>= 32;
				p += nh1[i];
				p += MUL32(a0, k1);
				p += MUL32(a1, k0);
				p += MUL32(a2, 2*k3);
				p += MUL32(a3, 2*k2);
				a0 = t2;
				a1 = (word32)p;
				aHi = (p >> 32) + t;
			}

			#undef a0
			#undef a1
			#undef a2
			#undef a3
			#undef aHi
			#undef k0
			#undef k1
			#undef k2
			#undef k3
			#undef kHi
		#else // #if VMAC_BOOL_32BIT
			if (isFirstBlock)
			{
				isFirstBlock = false;
				if (m_isFirstBlock)
				{
					m_isFirstBlock = false;
					#if VMAC_BOOL_WORD128
						#define first_poly_step(a, kh, kl, m) a = (m & m126) + ((word128(kh) << 64) | kl)

						first_poly_step(a1, kh1, kl1, nhA);
						if (T_128BitTag)
							first_poly_step(a2, kh2, kl2, nhB);
					#else
						#define first_poly_step(ah, al, kh, kl, mh, ml) {\
							mh &= m62;\
							ADD128(mh, ml, kh, kl); \
							ah = mh; al = ml;}

						first_poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
						if (T_128BitTag)
							first_poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
					#endif
					continue;
				}
				else
				{
					#if VMAC_BOOL_WORD128
						a1 = (word128((polyS+0*4)[0]) << 64) | (polyS+0*4)[1];
					#else
						ah1=(polyS+0*4)[0]; al1=(polyS+0*4)[1];
					#endif
					if (T_128BitTag)
					{
						#if VMAC_BOOL_WORD128
							a2 = (word128((polyS+1*4)[0]) << 64) | (polyS+1*4)[1];
						#else
							ah2=(polyS+1*4)[0]; al2=(polyS+1*4)[1];
						#endif
					}
				}
			}

			#if VMAC_BOOL_WORD128
				#define poly_step(a, kh, kl, m) \
				{ word128 t1, t2, t3, t4;\
				  Multiply128(t2, a>>64, kl);\
				  Multiply128(t3, a, kh);\
				  Multiply128(t1, a, kl);\
				  Multiply128(t4, a>>64, 2*kh);\
				  t2 += t3;\
				  t4 += t1;\
				  t2 += t4>>64;\
				  a = (word128(word64(t2)&m63) << 64) | word64(t4);\
				  t2 *= 2;\
				  a += m & m126;\
				  a += t2>>64;}

				poly_step(a1, kh1, kl1, nhA);
				if (T_128BitTag)
					poly_step(a2, kh2, kl2, nhB);
			#else
				#define poly_step(ah, al, kh, kl, mh, ml) \
				{ word64 t1h, t1l, t2h, t2l, t3h, t3l, z=0; \
				  /* compute ab*cd, put bd into result registers */ \
				  MUL64(t2h,t2l,ah,kl); \
				  MUL64(t3h,t3l,al,kh); \
				  MUL64(t1h,t1l,ah,2*kh); \
				  MUL64(ah,al,al,kl); \
				  /* add together ad + bc */ \
				  ADD128(t2h,t2l,t3h,t3l); \
				  /* add 2 * ac to result */ \
				  ADD128(ah,al,t1h,t1l); \
				  /* now (ah,al), (t2l,2*t2h) need summing */ \
				  /* first add the high registers, carrying into t2h */ \
				  ADD128(t2h,ah,z,t2l); \
				  /* double t2h and add top bit of ah */ \
				  t2h += t2h + (ah >> 63); \
				  ah &= m63; \
				  /* now add the low registers */ \
				  mh &= m62; \
				  ADD128(ah,al,mh,ml); \
				  ADD128(ah,al,z,t2h); \
				}

				poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
				if (T_128BitTag)
					poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
			#endif
		#endif // #if VMAC_BOOL_32BIT
	} while (blocksRemainingInWord64);

	#if VMAC_BOOL_WORD128
		(polyS+0*4)[0]=word64(a1>>64); (polyS+0*4)[1]=word64(a1);
		if (T_128BitTag)
		{
			(polyS+1*4)[0]=word64(a2>>64); (polyS+1*4)[1]=word64(a2);
		}
	#elif !VMAC_BOOL_32BIT
		(polyS+0*4)[0]=ah1; (polyS+0*4)[1]=al1;
		if (T_128BitTag)
		{
			(polyS+1*4)[0]=ah2; (polyS+1*4)[1]=al2;
		}
	#endif
}

inline void VMAC_Base::VHASH_Update(const word64 *data, size_t blocksRemainingInWord64)
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86
	if (HasSSE2())
	{
		VHASH_Update_SSE2(data, blocksRemainingInWord64, 0);
		if (m_is128)
			VHASH_Update_SSE2(data, blocksRemainingInWord64, 1);
		m_isFirstBlock = false;
	}
	else
#endif
	{
#if defined(_MSC_VER) && _MSC_VER < 1300
		VHASH_Update_Template(data, blocksRemainingInWord64);
#else
		if (m_is128)
			VHASH_Update_Template<true>(data, blocksRemainingInWord64);
		else
			VHASH_Update_Template<false>(data, blocksRemainingInWord64);
#endif
	}
}

size_t VMAC_Base::HashMultipleBlocks(const word64 *data, size_t length)
{
	size_t remaining = ModPowerOf2(length, m_L1KeyLength);
	VHASH_Update(data, (length-remaining)/8);
	return remaining;
}

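// L3: fold the 127-bit second-level accumulator (with the bit length of the
// final partial block mixed in as "len") down to a 64-bit result. The steps
// follow the inline comments below: reduce mod p127 = 2^127 - 1, split using
// the modulus 2^64 - 2^32, add the key words mod p64 = 2^64 - 257, then
// multiply mod p64.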
static word64 L3Hash(const word64 *input, const word64 *l3Key, size_t len)
{
	word64 rh, rl, t, z=0;
	word64 p1 = input[0], p2 = input[1];
	word64 k1 = l3Key[0], k2 = l3Key[1];

	/* fully reduce (p1,p2)+(len,0) mod p127 */
	t = p1 >> 63;
	p1 &= m63;
	ADD128(p1, p2, len, t);
	/* At this point, (p1,p2) is at most 2^127+(len<<64) */
	t = (p1 > m63) + ((p1 == m63) & (p2 == m64));
	ADD128(p1, p2, z, t);
	p1 &= m63;

	/* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
	t = p1 + (p2 >> 32);
	t += (t >> 32);
	t += (word32)t > 0xfffffffeU;
	p1 += (t >> 32);
	p2 += (p1 << 32);

	/* compute (p1+k1)%p64 and (p2+k2)%p64 */
	p1 += k1;
	p1 += (0 - (p1 < k1)) & 257;
	p2 += k2;
	p2 += (0 - (p2 < k2)) & 257;

	/* compute (p1+k1)*(p2+k2)%p64 */
	MUL64(rh, rl, p1, p2);
	t = rh >> 56;
	ADD128(t, rl, z, rh);
	rh <<= 8;
	ADD128(t, rl, z, rh);
	t += t << 8;
	rl += t;
	rl += (0 - (rl < t)) & 257;
	rl += (0 - (rl > p64-1)) & 257;
	return rl;
}

void VMAC_Base::TruncatedFinal(byte *mac, size_t size)
{
	size_t len = ModPowerOf2(GetBitCountLo()/8, m_L1KeyLength);

	if (len)
	{
		memset(m_data()+len, 0, (0-len)%16);
		VHASH_Update(DataBuf(), ((len+15)/16)*2);
		len *= 8; // convert to bits
	}
	else if (m_isFirstBlock)
	{
		// special case for empty string
		m_polyState()[0] = m_polyState()[2];
		m_polyState()[1] = m_polyState()[3];
		if (m_is128)
		{
			m_polyState()[4] = m_polyState()[6];
			m_polyState()[5] = m_polyState()[7];
		}
	}

	if (m_is128)
	{
		word64 t[2];
		t[0] = L3Hash(m_polyState(), m_l3Key(), len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad());
		t[1] = L3Hash(m_polyState()+4, m_l3Key()+2, len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad()+8);
		if (size == 16)
		{
			PutWord(false, BIG_ENDIAN_ORDER, mac, t[0]);
			PutWord(false, BIG_ENDIAN_ORDER, mac+8, t[1]);
		}
		else
		{
			t[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[0]);
			t[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[1]);
			memcpy(mac, t, size);
		}
	}
	else
	{
		word64 t = L3Hash(m_polyState(), m_l3Key(), len);
		t += GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad() + (m_nonce()[IVSize()-1]&1) * 8);
		if (size == 8)
			PutWord(false, BIG_ENDIAN_ORDER, mac, t);
		else
		{
			t = ConditionalByteReverse(BIG_ENDIAN_ORDER, t);
			memcpy(mac, &t, size);
		}
	}
}

NAMESPACE_END