Crypto++
integer.cpp
1 // integer.cpp - written and placed in the public domain by Wei Dai
2 // contains public domain code contributed by Alister Lee and Leonard Janke
3 
4 #include "pch.h"
5 
6 #ifndef CRYPTOPP_IMPORTS
7 
8 #include "integer.h"
9 #include "modarith.h"
10 #include "nbtheory.h"
11 #include "asn.h"
12 #include "oids.h"
13 #include "words.h"
14 #include "algparam.h"
15 #include "pubkey.h" // for P1363_KDF2
16 #include "sha.h"
17 #include "cpu.h"
18 
19 #include <iostream>
20 
21 #if _MSC_VER >= 1400
22  #include <intrin.h>
23 #endif
24 
25 #ifdef __DECCXX
26  #include <c_asm.h>
27 #endif
28 
29 #ifdef CRYPTOPP_MSVC6_NO_PP
30  #pragma message("You do not seem to have the Visual C++ Processor Pack installed, so use of SSE2 instructions will be disabled.")
31 #endif
32 
33 #define CRYPTOPP_INTEGER_SSE2 (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && CRYPTOPP_BOOL_X86)
34 
35 NAMESPACE_BEGIN(CryptoPP)
36 
37 bool AssignIntToInteger(const std::type_info &valueType, void *pInteger, const void *pInt)
38 {
39  if (valueType != typeid(Integer))
40  return false;
41  *reinterpret_cast<Integer *>(pInteger) = *reinterpret_cast<const int *>(pInt);
42  return true;
43 }
44 
45 inline static int Compare(const word *A, const word *B, size_t N)
46 {
47  while (N--)
48  if (A[N] > B[N])
49  return 1;
50  else if (A[N] < B[N])
51  return -1;
52 
53  return 0;
54 }
55 
56 inline static int Increment(word *A, size_t N, word B=1)
57 {
58  assert(N);
59  word t = A[0];
60  A[0] = t+B;
61  if (A[0] >= t)
62  return 0;
63  for (unsigned i=1; i<N; i++)
64  if (++A[i])
65  return 0;
66  return 1;
67 }
68 
69 inline static int Decrement(word *A, size_t N, word B=1)
70 {
71  assert(N);
72  word t = A[0];
73  A[0] = t-B;
74  if (A[0] <= t)
75  return 0;
76  for (unsigned i=1; i<N; i++)
77  if (A[i]--)
78  return 0;
79  return 1;
80 }
81 
82 static void TwosComplement(word *A, size_t N)
83 {
84  Decrement(A, N);
85  for (unsigned i=0; i<N; i++)
86  A[i] = ~A[i];
87 }
88 
89 static word AtomicInverseModPower2(word A)
90 {
91  assert(A%2==1);
92 
93  word R=A%8;
94 
95  for (unsigned i=3; i<WORD_BITS; i*=2)
96  R = R*(2-R*A);
97 
98  assert(R*A==1);
99  return R;
100 }
101 
102 // ********************************************************
103 
104 #if !defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE) || (defined(__x86_64__) && defined(CRYPTOPP_WORD128_AVAILABLE))
105  #define Declare2Words(x) word x##0, x##1;
106  #define AssignWord(a, b) a##0 = b; a##1 = 0;
107  #define Add2WordsBy1(a, b, c) a##0 = b##0 + c; a##1 = b##1 + (a##0 < c);
108  #define LowWord(a) a##0
109  #define HighWord(a) a##1
110  #ifdef _MSC_VER
111  #define MultiplyWordsLoHi(p0, p1, a, b) p0 = _umul128(a, b, &p1);
112  #ifndef __INTEL_COMPILER
113  #define Double3Words(c, d) d##1 = __shiftleft128(d##0, d##1, 1); d##0 = __shiftleft128(c, d##0, 1); c *= 2;
114  #endif
115  #elif defined(__DECCXX)
116  #define MultiplyWordsLoHi(p0, p1, a, b) p0 = a*b; p1 = asm("umulh %a0, %a1, %v0", a, b);
117  #elif defined(__x86_64__)
118  #if defined(__SUNPRO_CC) && __SUNPRO_CC < 0x5100
119  // Sun Studio's gcc-style inline assembly is heavily bugged as of version 5.9 Patch 124864-09 2008/12/16, but this one works
120  #define MultiplyWordsLoHi(p0, p1, a, b) asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "r"(b) : "cc");
121  #else
122  #define MultiplyWordsLoHi(p0, p1, a, b) asm ("mulq %3" : "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
123  #define MulAcc(c, d, a, b) asm ("mulq %6; addq %3, %0; adcq %4, %1; adcq $0, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1), "=a"(p0), "=d"(p1) : "a"(a), "g"(b) : "cc");
124  #define Double3Words(c, d) asm ("addq %0, %0; adcq %1, %1; adcq %2, %2;" : "+r"(c), "+r"(d##0), "+r"(d##1) : : "cc");
125  #define Acc2WordsBy1(a, b) asm ("addq %2, %0; adcq $0, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b) : "cc");
126  #define Acc2WordsBy2(a, b) asm ("addq %2, %0; adcq %3, %1;" : "+r"(a##0), "+r"(a##1) : "r"(b##0), "r"(b##1) : "cc");
127  #define Acc3WordsBy2(c, d, e) asm ("addq %5, %0; adcq %6, %1; adcq $0, %2;" : "+r"(c), "=r"(e##0), "=r"(e##1) : "1"(d##0), "2"(d##1), "r"(e##0), "r"(e##1) : "cc");
128  #endif
129  #endif
130  #define MultiplyWords(p, a, b) MultiplyWordsLoHi(p##0, p##1, a, b)
131  #ifndef Double3Words
132  #define Double3Words(c, d) d##1 = 2*d##1 + (d##0>>(WORD_BITS-1)); d##0 = 2*d##0 + (c>>(WORD_BITS-1)); c *= 2;
133  #endif
134  #ifndef Acc2WordsBy2
135  #define Acc2WordsBy2(a, b) a##0 += b##0; a##1 += a##0 < b##0; a##1 += b##1;
136  #endif
137  #define AddWithCarry(u, a, b) {word t = a+b; u##0 = t + u##1; u##1 = (t<a) + (u##0<t);}
138  #define SubtractWithBorrow(u, a, b) {word t = a-b; u##0 = t - u##1; u##1 = (t>a) + (u##0>t);}
139  #define GetCarry(u) u##1
140  #define GetBorrow(u) u##1
141 #else
142  #define Declare2Words(x) dword x;
143  #if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
144  #define MultiplyWords(p, a, b) p = __emulu(a, b);
145  #else
146  #define MultiplyWords(p, a, b) p = (dword)a*b;
147  #endif
148  #define AssignWord(a, b) a = b;
149  #define Add2WordsBy1(a, b, c) a = b + c;
150  #define Acc2WordsBy2(a, b) a += b;
151  #define LowWord(a) word(a)
152  #define HighWord(a) word(a>>WORD_BITS)
153  #define Double3Words(c, d) d = 2*d + (c>>(WORD_BITS-1)); c *= 2;
154  #define AddWithCarry(u, a, b) u = dword(a) + b + GetCarry(u);
155  #define SubtractWithBorrow(u, a, b) u = dword(a) - b - GetBorrow(u);
156  #define GetCarry(u) HighWord(u)
157  #define GetBorrow(u) word(u>>(WORD_BITS*2-1))
158 #endif
159 #ifndef MulAcc
160  #define MulAcc(c, d, a, b) MultiplyWords(p, a, b); Acc2WordsBy1(p, c); c = LowWord(p); Acc2WordsBy1(d, HighWord(p));
161 #endif
162 #ifndef Acc2WordsBy1
163  #define Acc2WordsBy1(a, b) Add2WordsBy1(a, a, b)
164 #endif
165 #ifndef Acc3WordsBy2
166  #define Acc3WordsBy2(c, d, e) Acc2WordsBy1(e, c); c = LowWord(e); Add2WordsBy1(e, d, HighWord(e));
167 #endif
168 
169 class DWord
170 {
171 public:
172  DWord() {}
173 
174 #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
175  explicit DWord(word low)
176  {
177  m_whole = low;
178  }
179 #else
180  explicit DWord(word low)
181  {
182  m_halfs.low = low;
183  m_halfs.high = 0;
184  }
185 #endif
186 
187  DWord(word low, word high)
188  {
189  m_halfs.low = low;
190  m_halfs.high = high;
191  }
192 
193  static DWord Multiply(word a, word b)
194  {
195  DWord r;
196  #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
197  r.m_whole = (dword)a * b;
198  #elif defined(MultiplyWordsLoHi)
199  MultiplyWordsLoHi(r.m_halfs.low, r.m_halfs.high, a, b);
200  #endif
201  return r;
202  }
203 
204  static DWord MultiplyAndAdd(word a, word b, word c)
205  {
206  DWord r = Multiply(a, b);
207  return r += c;
208  }
209 
210  DWord & operator+=(word a)
211  {
212  #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
213  m_whole = m_whole + a;
214  #else
215  m_halfs.low += a;
216  m_halfs.high += (m_halfs.low < a);
217  #endif
218  return *this;
219  }
220 
221  DWord operator+(word a)
222  {
223  DWord r;
224  #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
225  r.m_whole = m_whole + a;
226  #else
227  r.m_halfs.low = m_halfs.low + a;
228  r.m_halfs.high = m_halfs.high + (r.m_halfs.low < a);
229  #endif
230  return r;
231  }
232 
233  DWord operator-(DWord a)
234  {
235  DWord r;
236  #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
237  r.m_whole = m_whole - a.m_whole;
238  #else
239  r.m_halfs.low = m_halfs.low - a.m_halfs.low;
240  r.m_halfs.high = m_halfs.high - a.m_halfs.high - (r.m_halfs.low > m_halfs.low);
241  #endif
242  return r;
243  }
244 
245  DWord operator-(word a)
246  {
247  DWord r;
248  #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
249  r.m_whole = m_whole - a;
250  #else
251  r.m_halfs.low = m_halfs.low - a;
252  r.m_halfs.high = m_halfs.high - (r.m_halfs.low > m_halfs.low);
253  #endif
254  return r;
255  }
256 
257  // returns quotient, which must fit in a word
258  word operator/(word divisor);
259 
260  word operator%(word a);
261 
262  bool operator!() const
263  {
264  #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
265  return !m_whole;
266  #else
267  return !m_halfs.high && !m_halfs.low;
268  #endif
269  }
270 
271  word GetLowHalf() const {return m_halfs.low;}
272  word GetHighHalf() const {return m_halfs.high;}
273  word GetHighHalfAsBorrow() const {return 0-m_halfs.high;}
274 
275 private:
276  union
277  {
278  #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
279  dword m_whole;
280  #endif
281  struct
282  {
283  #ifdef IS_LITTLE_ENDIAN
284  word low;
285  word high;
286  #else
287  word high;
288  word low;
289  #endif
290  } m_halfs;
291  };
292 };
293 
294 class Word
295 {
296 public:
297  Word() {}
298 
299  Word(word value)
300  {
301  m_whole = value;
302  }
303 
304  Word(hword low, hword high)
305  {
306  m_whole = low | (word(high) << (WORD_BITS/2));
307  }
308 
309  static Word Multiply(hword a, hword b)
310  {
311  Word r;
312  r.m_whole = (word)a * b;
313  return r;
314  }
315 
316  Word operator-(Word a)
317  {
318  Word r;
319  r.m_whole = m_whole - a.m_whole;
320  return r;
321  }
322 
323  Word operator-(hword a)
324  {
325  Word r;
326  r.m_whole = m_whole - a;
327  return r;
328  }
329 
330  // returns quotient, which must fit in a word
331  hword operator/(hword divisor)
332  {
333  return hword(m_whole / divisor);
334  }
335 
336  bool operator!() const
337  {
338  return !m_whole;
339  }
340 
341  word GetWhole() const {return m_whole;}
342  hword GetLowHalf() const {return hword(m_whole);}
343  hword GetHighHalf() const {return hword(m_whole>>(WORD_BITS/2));}
344  hword GetHighHalfAsBorrow() const {return 0-hword(m_whole>>(WORD_BITS/2));}
345 
346 private:
347  word m_whole;
348 };
349 
350 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
351 template <class S, class D>
352 S DivideThreeWordsByTwo(S *A, S B0, S B1, D *dummy=NULL)
353 {
354  // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S
355  assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
356 
357  // estimate the quotient: do a 2 S by 1 S divide
358  S Q;
359  if (S(B1+1) == 0)
360  Q = A[2];
361  else if (B1 > 0)
362  Q = D(A[1], A[2]) / S(B1+1);
363  else
364  Q = D(A[0], A[1]) / B0;
365 
366  // now subtract Q*B from A
367  D p = D::Multiply(B0, Q);
368  D u = (D) A[0] - p.GetLowHalf();
369  A[0] = u.GetLowHalf();
370  u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - D::Multiply(B1, Q);
371  A[1] = u.GetLowHalf();
372  A[2] += u.GetHighHalf();
373 
374  // Q <= actual quotient, so fix it
375  while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
376  {
377  u = (D) A[0] - B0;
378  A[0] = u.GetLowHalf();
379  u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
380  A[1] = u.GetLowHalf();
381  A[2] += u.GetHighHalf();
382  Q++;
383  assert(Q); // shouldn't overflow
384  }
385 
386  return Q;
387 }
388 
389 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
390 template <class S, class D>
391 inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
392 {
393  if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
394  return D(Ah.GetLowHalf(), Ah.GetHighHalf());
395  else
396  {
397  S Q[2];
398  T[0] = Al.GetLowHalf();
399  T[1] = Al.GetHighHalf();
400  T[2] = Ah.GetLowHalf();
401  T[3] = Ah.GetHighHalf();
402  Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), B.GetHighHalf());
403  Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
404  return D(Q[0], Q[1]);
405  }
406 }
407 
408 // returns quotient, which must fit in a word
409 inline word DWord::operator/(word a)
410 {
411  #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
412  return word(m_whole / a);
413  #else
414  hword r[4];
415  return DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a).GetWhole();
416  #endif
417 }
418 
419 inline word DWord::operator%(word a)
420 {
421  #ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
422  return word(m_whole % a);
423  #else
424  if (a < (word(1) << (WORD_BITS/2)))
425  {
426  hword h = hword(a);
427  word r = m_halfs.high % h;
428  r = ((m_halfs.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
429  return hword((hword(m_halfs.low) + (r << (WORD_BITS/2))) % h);
430  }
431  else
432  {
433  hword r[4];
434  DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a);
435  return Word(r[0], r[1]).GetWhole();
436  }
437  #endif
438 }
439 
440 // ********************************************************
441 
442 // use some tricks to share assembly code between MSVC and GCC
443 #if defined(__GNUC__)
444  #define AddPrologue \
445  int result; \
446  __asm__ __volatile__ \
447  ( \
448  ".intel_syntax noprefix;"
449  #define AddEpilogue \
450  ".att_syntax prefix;" \
451  : "=a" (result)\
452  : "d" (C), "a" (A), "D" (B), "c" (N) \
453  : "%esi", "memory", "cc" \
454  );\
455  return result;
456  #define MulPrologue \
457  __asm__ __volatile__ \
458  ( \
459  ".intel_syntax noprefix;" \
460  AS1( push ebx) \
461  AS2( mov ebx, edx)
462  #define MulEpilogue \
463  AS1( pop ebx) \
464  ".att_syntax prefix;" \
465  : \
466  : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B) \
467  : "%esi", "memory", "cc" \
468  );
469  #define SquPrologue MulPrologue
470  #define SquEpilogue \
471  AS1( pop ebx) \
472  ".att_syntax prefix;" \
473  : \
474  : "d" (s_maskLow16), "c" (C), "a" (A) \
475  : "%esi", "%edi", "memory", "cc" \
476  );
477  #define TopPrologue MulPrologue
478  #define TopEpilogue \
479  AS1( pop ebx) \
480  ".att_syntax prefix;" \
481  : \
482  : "d" (s_maskLow16), "c" (C), "a" (A), "D" (B), "S" (L) \
483  : "memory", "cc" \
484  );
485 #else
486  #define AddPrologue \
487  __asm push edi \
488  __asm push esi \
489  __asm mov eax, [esp+12] \
490  __asm mov edi, [esp+16]
491  #define AddEpilogue \
492  __asm pop esi \
493  __asm pop edi \
494  __asm ret 8
495 #if _MSC_VER < 1300
496  #define SaveEBX __asm push ebx
497  #define RestoreEBX __asm pop ebx
498 #else
499  #define SaveEBX
500  #define RestoreEBX
501 #endif
502  #define SquPrologue \
503  AS2( mov eax, A) \
504  AS2( mov ecx, C) \
505  SaveEBX \
506  AS2( lea ebx, s_maskLow16)
507  #define MulPrologue \
508  AS2( mov eax, A) \
509  AS2( mov edi, B) \
510  AS2( mov ecx, C) \
511  SaveEBX \
512  AS2( lea ebx, s_maskLow16)
513  #define TopPrologue \
514  AS2( mov eax, A) \
515  AS2( mov edi, B) \
516  AS2( mov ecx, C) \
517  AS2( mov esi, L) \
518  SaveEBX \
519  AS2( lea ebx, s_maskLow16)
520  #define SquEpilogue RestoreEBX
521  #define MulEpilogue RestoreEBX
522  #define TopEpilogue RestoreEBX
523 #endif
524 
525 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
526 extern "C" {
527 int Baseline_Add(size_t N, word *C, const word *A, const word *B);
528 int Baseline_Sub(size_t N, word *C, const word *A, const word *B);
529 }
530 #elif defined(CRYPTOPP_X64_ASM_AVAILABLE) && defined(__GNUC__) && defined(CRYPTOPP_WORD128_AVAILABLE)
531 int Baseline_Add(size_t N, word *C, const word *A, const word *B)
532 {
533  word result;
534  __asm__ __volatile__
535  (
536  ".intel_syntax;"
537  AS1( neg %1)
538  ASJ( jz, 1, f)
539  AS2( mov %0,[%3+8*%1])
540  AS2( add %0,[%4+8*%1])
541  AS2( mov [%2+8*%1],%0)
542  ASL(0)
543  AS2( mov %0,[%3+8*%1+8])
544  AS2( adc %0,[%4+8*%1+8])
545  AS2( mov [%2+8*%1+8],%0)
546  AS2( lea %1,[%1+2])
547  ASJ( jrcxz, 1, f)
548  AS2( mov %0,[%3+8*%1])
549  AS2( adc %0,[%4+8*%1])
550  AS2( mov [%2+8*%1],%0)
551  ASJ( jmp, 0, b)
552  ASL(1)
553  AS2( mov %0, 0)
554  AS2( adc %0, %0)
555  ".att_syntax;"
556  : "=&r" (result), "+c" (N)
557  : "r" (C+N), "r" (A+N), "r" (B+N)
558  : "memory", "cc"
559  );
560  return (int)result;
561 }
562 
563 int Baseline_Sub(size_t N, word *C, const word *A, const word *B)
564 {
565  word result;
566  __asm__ __volatile__
567  (
568  ".intel_syntax;"
569  AS1( neg %1)
570  ASJ( jz, 1, f)
571  AS2( mov %0,[%3+8*%1])
572  AS2( sub %0,[%4+8*%1])
573  AS2( mov [%2+8*%1],%0)
574  ASL(0)
575  AS2( mov %0,[%3+8*%1+8])
576  AS2( sbb %0,[%4+8*%1+8])
577  AS2( mov [%2+8*%1+8],%0)
578  AS2( lea %1,[%1+2])
579  ASJ( jrcxz, 1, f)
580  AS2( mov %0,[%3+8*%1])
581  AS2( sbb %0,[%4+8*%1])
582  AS2( mov [%2+8*%1],%0)
583  ASJ( jmp, 0, b)
584  ASL(1)
585  AS2( mov %0, 0)
586  AS2( adc %0, %0)
587  ".att_syntax;"
588  : "=&r" (result), "+c" (N)
589  : "r" (C+N), "r" (A+N), "r" (B+N)
590  : "memory", "cc"
591  );
592  return (int)result;
593 }
594 #elif defined(CRYPTOPP_X86_ASM_AVAILABLE) && CRYPTOPP_BOOL_X86
595 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
596 {
597  AddPrologue
598 
599  // now: eax = A, edi = B, edx = C, ecx = N
600  AS2( lea eax, [eax+4*ecx])
601  AS2( lea edi, [edi+4*ecx])
602  AS2( lea edx, [edx+4*ecx])
603 
604  AS1( neg ecx) // ecx is negative index
605  AS2( test ecx, 2) // this clears carry flag
606  ASJ( jz, 0, f)
607  AS2( sub ecx, 2)
608  ASJ( jmp, 1, f)
609 
610  ASL(0)
611  ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero
612  AS2( mov esi,[eax+4*ecx])
613  AS2( adc esi,[edi+4*ecx])
614  AS2( mov [edx+4*ecx],esi)
615  AS2( mov esi,[eax+4*ecx+4])
616  AS2( adc esi,[edi+4*ecx+4])
617  AS2( mov [edx+4*ecx+4],esi)
618  ASL(1)
619  AS2( mov esi,[eax+4*ecx+8])
620  AS2( adc esi,[edi+4*ecx+8])
621  AS2( mov [edx+4*ecx+8],esi)
622  AS2( mov esi,[eax+4*ecx+12])
623  AS2( adc esi,[edi+4*ecx+12])
624  AS2( mov [edx+4*ecx+12],esi)
625 
626  AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2
627  ASJ( jmp, 0, b)
628 
629  ASL(2)
630  AS2( mov eax, 0)
631  AS1( setc al) // store carry into eax (return result register)
632 
633  AddEpilogue
634 }
635 
636 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
637 {
638  AddPrologue
639 
640  // now: eax = A, edi = B, edx = C, ecx = N
641  AS2( lea eax, [eax+4*ecx])
642  AS2( lea edi, [edi+4*ecx])
643  AS2( lea edx, [edx+4*ecx])
644 
645  AS1( neg ecx) // ecx is negative index
646  AS2( test ecx, 2) // this clears carry flag
647  ASJ( jz, 0, f)
648  AS2( sub ecx, 2)
649  ASJ( jmp, 1, f)
650 
651  ASL(0)
652  ASJ( jecxz, 2, f) // loop until ecx overflows and becomes zero
653  AS2( mov esi,[eax+4*ecx])
654  AS2( sbb esi,[edi+4*ecx])
655  AS2( mov [edx+4*ecx],esi)
656  AS2( mov esi,[eax+4*ecx+4])
657  AS2( sbb esi,[edi+4*ecx+4])
658  AS2( mov [edx+4*ecx+4],esi)
659  ASL(1)
660  AS2( mov esi,[eax+4*ecx+8])
661  AS2( sbb esi,[edi+4*ecx+8])
662  AS2( mov [edx+4*ecx+8],esi)
663  AS2( mov esi,[eax+4*ecx+12])
664  AS2( sbb esi,[edi+4*ecx+12])
665  AS2( mov [edx+4*ecx+12],esi)
666 
667  AS2( lea ecx,[ecx+4]) // advance index, avoid inc which causes slowdown on Intel Core 2
668  ASJ( jmp, 0, b)
669 
670  ASL(2)
671  AS2( mov eax, 0)
672  AS1( setc al) // store carry into eax (return result register)
673 
674  AddEpilogue
675 }
676 
677 #if CRYPTOPP_INTEGER_SSE2
678 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Add(size_t N, word *C, const word *A, const word *B)
679 {
680  AddPrologue
681 
682  // now: eax = A, edi = B, edx = C, ecx = N
683  AS2( lea eax, [eax+4*ecx])
684  AS2( lea edi, [edi+4*ecx])
685  AS2( lea edx, [edx+4*ecx])
686 
687  AS1( neg ecx) // ecx is negative index
688  AS2( pxor mm2, mm2)
689  ASJ( jz, 2, f)
690  AS2( test ecx, 2) // this clears carry flag
691  ASJ( jz, 0, f)
692  AS2( sub ecx, 2)
693  ASJ( jmp, 1, f)
694 
695  ASL(0)
696  AS2( movd mm0, DWORD PTR [eax+4*ecx])
697  AS2( movd mm1, DWORD PTR [edi+4*ecx])
698  AS2( paddq mm0, mm1)
699  AS2( paddq mm2, mm0)
700  AS2( movd DWORD PTR [edx+4*ecx], mm2)
701  AS2( psrlq mm2, 32)
702 
703  AS2( movd mm0, DWORD PTR [eax+4*ecx+4])
704  AS2( movd mm1, DWORD PTR [edi+4*ecx+4])
705  AS2( paddq mm0, mm1)
706  AS2( paddq mm2, mm0)
707  AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
708  AS2( psrlq mm2, 32)
709 
710  ASL(1)
711  AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
712  AS2( movd mm1, DWORD PTR [edi+4*ecx+8])
713  AS2( paddq mm0, mm1)
714  AS2( paddq mm2, mm0)
715  AS2( movd DWORD PTR [edx+4*ecx+8], mm2)
716  AS2( psrlq mm2, 32)
717 
718  AS2( movd mm0, DWORD PTR [eax+4*ecx+12])
719  AS2( movd mm1, DWORD PTR [edi+4*ecx+12])
720  AS2( paddq mm0, mm1)
721  AS2( paddq mm2, mm0)
722  AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
723  AS2( psrlq mm2, 32)
724 
725  AS2( add ecx, 4)
726  ASJ( jnz, 0, b)
727 
728  ASL(2)
729  AS2( movd eax, mm2)
730  AS1( emms)
731 
732  AddEpilogue
733 }
734 CRYPTOPP_NAKED int CRYPTOPP_FASTCALL SSE2_Sub(size_t N, word *C, const word *A, const word *B)
735 {
736  AddPrologue
737 
738  // now: eax = A, edi = B, edx = C, ecx = N
739  AS2( lea eax, [eax+4*ecx])
740  AS2( lea edi, [edi+4*ecx])
741  AS2( lea edx, [edx+4*ecx])
742 
743  AS1( neg ecx) // ecx is negative index
744  AS2( pxor mm2, mm2)
745  ASJ( jz, 2, f)
746  AS2( test ecx, 2) // this clears carry flag
747  ASJ( jz, 0, f)
748  AS2( sub ecx, 2)
749  ASJ( jmp, 1, f)
750 
751  ASL(0)
752  AS2( movd mm0, DWORD PTR [eax+4*ecx])
753  AS2( movd mm1, DWORD PTR [edi+4*ecx])
754  AS2( psubq mm0, mm1)
755  AS2( psubq mm0, mm2)
756  AS2( movd DWORD PTR [edx+4*ecx], mm0)
757  AS2( psrlq mm0, 63)
758 
759  AS2( movd mm2, DWORD PTR [eax+4*ecx+4])
760  AS2( movd mm1, DWORD PTR [edi+4*ecx+4])
761  AS2( psubq mm2, mm1)
762  AS2( psubq mm2, mm0)
763  AS2( movd DWORD PTR [edx+4*ecx+4], mm2)
764  AS2( psrlq mm2, 63)
765 
766  ASL(1)
767  AS2( movd mm0, DWORD PTR [eax+4*ecx+8])
768  AS2( movd mm1, DWORD PTR [edi+4*ecx+8])
769  AS2( psubq mm0, mm1)
770  AS2( psubq mm0, mm2)
771  AS2( movd DWORD PTR [edx+4*ecx+8], mm0)
772  AS2( psrlq mm0, 63)
773 
774  AS2( movd mm2, DWORD PTR [eax+4*ecx+12])
775  AS2( movd mm1, DWORD PTR [edi+4*ecx+12])
776  AS2( psubq mm2, mm1)
777  AS2( psubq mm2, mm0)
778  AS2( movd DWORD PTR [edx+4*ecx+12], mm2)
779  AS2( psrlq mm2, 63)
780 
781  AS2( add ecx, 4)
782  ASJ( jnz, 0, b)
783 
784  ASL(2)
785  AS2( movd eax, mm2)
786  AS1( emms)
787 
788  AddEpilogue
789 }
790 #endif // #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
791 #else
792 int CRYPTOPP_FASTCALL Baseline_Add(size_t N, word *C, const word *A, const word *B)
793 {
794  assert (N%2 == 0);
795 
796  Declare2Words(u);
797  AssignWord(u, 0);
798  for (size_t i=0; i<N; i+=2)
799  {
800  AddWithCarry(u, A[i], B[i]);
801  C[i] = LowWord(u);
802  AddWithCarry(u, A[i+1], B[i+1]);
803  C[i+1] = LowWord(u);
804  }
805  return int(GetCarry(u));
806 }
807 
808 int CRYPTOPP_FASTCALL Baseline_Sub(size_t N, word *C, const word *A, const word *B)
809 {
810  assert (N%2 == 0);
811 
812  Declare2Words(u);
813  AssignWord(u, 0);
814  for (size_t i=0; i<N; i+=2)
815  {
816  SubtractWithBorrow(u, A[i], B[i]);
817  C[i] = LowWord(u);
818  SubtractWithBorrow(u, A[i+1], B[i+1]);
819  C[i+1] = LowWord(u);
820  }
821  return int(GetBorrow(u));
822 }
823 #endif
824 
825 static word LinearMultiply(word *C, const word *A, word B, size_t N)
826 {
827  word carry=0;
828  for(unsigned i=0; i<N; i++)
829  {
830  Declare2Words(p);
831  MultiplyWords(p, A[i], B);
832  Acc2WordsBy1(p, carry);
833  C[i] = LowWord(p);
834  carry = HighWord(p);
835  }
836  return carry;
837 }
838 
839 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
840 
841 #define Mul_2 \
842  Mul_Begin(2) \
843  Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
844  Mul_End(1, 1)
845 
846 #define Mul_4 \
847  Mul_Begin(4) \
848  Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
849  Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
850  Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
851  Mul_SaveAcc(3, 1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
852  Mul_SaveAcc(4, 2, 3) Mul_Acc(3, 2) \
853  Mul_End(5, 3)
854 
855 #define Mul_8 \
856  Mul_Begin(8) \
857  Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
858  Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
859  Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
860  Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
861  Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
862  Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
863  Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
864  Mul_SaveAcc(7, 1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
865  Mul_SaveAcc(8, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
866  Mul_SaveAcc(9, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
867  Mul_SaveAcc(10, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
868  Mul_SaveAcc(11, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
869  Mul_SaveAcc(12, 6, 7) Mul_Acc(7, 6) \
870  Mul_End(13, 7)
871 
872 #define Mul_16 \
873  Mul_Begin(16) \
874  Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
875  Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
876  Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
877  Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
878  Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
879  Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
880  Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
881  Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
882  Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
883  Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
884  Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
885  Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
886  Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
887  Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
888  Mul_SaveAcc(14, 0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
889  Mul_SaveAcc(15, 1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
890  Mul_SaveAcc(16, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
891  Mul_SaveAcc(17, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
892  Mul_SaveAcc(18, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
893  Mul_SaveAcc(19, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
894  Mul_SaveAcc(20, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
895  Mul_SaveAcc(21, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
896  Mul_SaveAcc(22, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
897  Mul_SaveAcc(23, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
898  Mul_SaveAcc(24, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
899  Mul_SaveAcc(25, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
900  Mul_SaveAcc(26, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
901  Mul_SaveAcc(27, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
902  Mul_SaveAcc(28, 14, 15) Mul_Acc(15, 14) \
903  Mul_End(29, 15)
904 
905 #define Squ_2 \
906  Squ_Begin(2) \
907  Squ_End(2)
908 
909 #define Squ_4 \
910  Squ_Begin(4) \
911  Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
912  Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
913  Squ_SaveAcc(3, 1, 3) Squ_Diag(2) \
914  Squ_SaveAcc(4, 2, 3) Squ_NonDiag \
915  Squ_End(4)
916 
917 #define Squ_8 \
918  Squ_Begin(8) \
919  Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
920  Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
921  Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
922  Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
923  Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
924  Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
925  Squ_SaveAcc(7, 1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
926  Squ_SaveAcc(8, 2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
927  Squ_SaveAcc(9, 3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
928  Squ_SaveAcc(10, 4, 7) Squ_Acc(5, 6) Squ_NonDiag \
929  Squ_SaveAcc(11, 5, 7) Squ_Diag(6) \
930  Squ_SaveAcc(12, 6, 7) Squ_NonDiag \
931  Squ_End(8)
932 
933 #define Squ_16 \
934  Squ_Begin(16) \
935  Squ_SaveAcc(1, 0, 2) Squ_Diag(1) \
936  Squ_SaveAcc(2, 0, 3) Squ_Acc(1, 2) Squ_NonDiag \
937  Squ_SaveAcc(3, 0, 4) Squ_Acc(1, 3) Squ_Diag(2) \
938  Squ_SaveAcc(4, 0, 5) Squ_Acc(1, 4) Squ_Acc(2, 3) Squ_NonDiag \
939  Squ_SaveAcc(5, 0, 6) Squ_Acc(1, 5) Squ_Acc(2, 4) Squ_Diag(3) \
940  Squ_SaveAcc(6, 0, 7) Squ_Acc(1, 6) Squ_Acc(2, 5) Squ_Acc(3, 4) Squ_NonDiag \
941  Squ_SaveAcc(7, 0, 8) Squ_Acc(1, 7) Squ_Acc(2, 6) Squ_Acc(3, 5) Squ_Diag(4) \
942  Squ_SaveAcc(8, 0, 9) Squ_Acc(1, 8) Squ_Acc(2, 7) Squ_Acc(3, 6) Squ_Acc(4, 5) Squ_NonDiag \
943  Squ_SaveAcc(9, 0, 10) Squ_Acc(1, 9) Squ_Acc(2, 8) Squ_Acc(3, 7) Squ_Acc(4, 6) Squ_Diag(5) \
944  Squ_SaveAcc(10, 0, 11) Squ_Acc(1, 10) Squ_Acc(2, 9) Squ_Acc(3, 8) Squ_Acc(4, 7) Squ_Acc(5, 6) Squ_NonDiag \
945  Squ_SaveAcc(11, 0, 12) Squ_Acc(1, 11) Squ_Acc(2, 10) Squ_Acc(3, 9) Squ_Acc(4, 8) Squ_Acc(5, 7) Squ_Diag(6) \
946  Squ_SaveAcc(12, 0, 13) Squ_Acc(1, 12) Squ_Acc(2, 11) Squ_Acc(3, 10) Squ_Acc(4, 9) Squ_Acc(5, 8) Squ_Acc(6, 7) Squ_NonDiag \
947  Squ_SaveAcc(13, 0, 14) Squ_Acc(1, 13) Squ_Acc(2, 12) Squ_Acc(3, 11) Squ_Acc(4, 10) Squ_Acc(5, 9) Squ_Acc(6, 8) Squ_Diag(7) \
948  Squ_SaveAcc(14, 0, 15) Squ_Acc(1, 14) Squ_Acc(2, 13) Squ_Acc(3, 12) Squ_Acc(4, 11) Squ_Acc(5, 10) Squ_Acc(6, 9) Squ_Acc(7, 8) Squ_NonDiag \
949  Squ_SaveAcc(15, 1, 15) Squ_Acc(2, 14) Squ_Acc(3, 13) Squ_Acc(4, 12) Squ_Acc(5, 11) Squ_Acc(6, 10) Squ_Acc(7, 9) Squ_Diag(8) \
950  Squ_SaveAcc(16, 2, 15) Squ_Acc(3, 14) Squ_Acc(4, 13) Squ_Acc(5, 12) Squ_Acc(6, 11) Squ_Acc(7, 10) Squ_Acc(8, 9) Squ_NonDiag \
951  Squ_SaveAcc(17, 3, 15) Squ_Acc(4, 14) Squ_Acc(5, 13) Squ_Acc(6, 12) Squ_Acc(7, 11) Squ_Acc(8, 10) Squ_Diag(9) \
952  Squ_SaveAcc(18, 4, 15) Squ_Acc(5, 14) Squ_Acc(6, 13) Squ_Acc(7, 12) Squ_Acc(8, 11) Squ_Acc(9, 10) Squ_NonDiag \
953  Squ_SaveAcc(19, 5, 15) Squ_Acc(6, 14) Squ_Acc(7, 13) Squ_Acc(8, 12) Squ_Acc(9, 11) Squ_Diag(10) \
954  Squ_SaveAcc(20, 6, 15) Squ_Acc(7, 14) Squ_Acc(8, 13) Squ_Acc(9, 12) Squ_Acc(10, 11) Squ_NonDiag \
955  Squ_SaveAcc(21, 7, 15) Squ_Acc(8, 14) Squ_Acc(9, 13) Squ_Acc(10, 12) Squ_Diag(11) \
956  Squ_SaveAcc(22, 8, 15) Squ_Acc(9, 14) Squ_Acc(10, 13) Squ_Acc(11, 12) Squ_NonDiag \
957  Squ_SaveAcc(23, 9, 15) Squ_Acc(10, 14) Squ_Acc(11, 13) Squ_Diag(12) \
958  Squ_SaveAcc(24, 10, 15) Squ_Acc(11, 14) Squ_Acc(12, 13) Squ_NonDiag \
959  Squ_SaveAcc(25, 11, 15) Squ_Acc(12, 14) Squ_Diag(13) \
960  Squ_SaveAcc(26, 12, 15) Squ_Acc(13, 14) Squ_NonDiag \
961  Squ_SaveAcc(27, 13, 15) Squ_Diag(14) \
962  Squ_SaveAcc(28, 14, 15) Squ_NonDiag \
963  Squ_End(16)
964 
965 #define Bot_2 \
966  Mul_Begin(2) \
967  Bot_SaveAcc(0, 0, 1) Bot_Acc(1, 0) \
968  Bot_End(2)
969 
970 #define Bot_4 \
971  Mul_Begin(4) \
972  Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
973  Mul_SaveAcc(1, 2, 0) Mul_Acc(1, 1) Mul_Acc(0, 2) \
974  Bot_SaveAcc(2, 0, 3) Bot_Acc(1, 2) Bot_Acc(2, 1) Bot_Acc(3, 0) \
975  Bot_End(4)
976 
977 #define Bot_8 \
978  Mul_Begin(8) \
979  Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
980  Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
981  Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
982  Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
983  Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
984  Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
985  Bot_SaveAcc(6, 0, 7) Bot_Acc(1, 6) Bot_Acc(2, 5) Bot_Acc(3, 4) Bot_Acc(4, 3) Bot_Acc(5, 2) Bot_Acc(6, 1) Bot_Acc(7, 0) \
986  Bot_End(8)
987 
988 #define Bot_16 \
989  Mul_Begin(16) \
990  Mul_SaveAcc(0, 0, 1) Mul_Acc(1, 0) \
991  Mul_SaveAcc(1, 0, 2) Mul_Acc(1, 1) Mul_Acc(2, 0) \
992  Mul_SaveAcc(2, 0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
993  Mul_SaveAcc(3, 0, 4) Mul_Acc(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) Mul_Acc(4, 0) \
994  Mul_SaveAcc(4, 0, 5) Mul_Acc(1, 4) Mul_Acc(2, 3) Mul_Acc(3, 2) Mul_Acc(4, 1) Mul_Acc(5, 0) \
995  Mul_SaveAcc(5, 0, 6) Mul_Acc(1, 5) Mul_Acc(2, 4) Mul_Acc(3, 3) Mul_Acc(4, 2) Mul_Acc(5, 1) Mul_Acc(6, 0) \
996  Mul_SaveAcc(6, 0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
997  Mul_SaveAcc(7, 0, 8) Mul_Acc(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) Mul_Acc(8, 0) \
998  Mul_SaveAcc(8, 0, 9) Mul_Acc(1, 8) Mul_Acc(2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) Mul_Acc(8, 1) Mul_Acc(9, 0) \
999  Mul_SaveAcc(9, 0, 10) Mul_Acc(1, 9) Mul_Acc(2, 8) Mul_Acc(3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) Mul_Acc(8, 2) Mul_Acc(9, 1) Mul_Acc(10, 0) \
1000  Mul_SaveAcc(10, 0, 11) Mul_Acc(1, 10) Mul_Acc(2, 9) Mul_Acc(3, 8) Mul_Acc(4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) Mul_Acc(8, 3) Mul_Acc(9, 2) Mul_Acc(10, 1) Mul_Acc(11, 0) \
1001  Mul_SaveAcc(11, 0, 12) Mul_Acc(1, 11) Mul_Acc(2, 10) Mul_Acc(3, 9) Mul_Acc(4, 8) Mul_Acc(5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) Mul_Acc(8, 4) Mul_Acc(9, 3) Mul_Acc(10, 2) Mul_Acc(11, 1) Mul_Acc(12, 0) \
1002  Mul_SaveAcc(12, 0, 13) Mul_Acc(1, 12) Mul_Acc(2, 11) Mul_Acc(3, 10) Mul_Acc(4, 9) Mul_Acc(5, 8) Mul_Acc(6, 7) Mul_Acc(7, 6) Mul_Acc(8, 5) Mul_Acc(9, 4) Mul_Acc(10, 3) Mul_Acc(11, 2) Mul_Acc(12, 1) Mul_Acc(13, 0) \
1003  Mul_SaveAcc(13, 0, 14) Mul_Acc(1, 13) Mul_Acc(2, 12) Mul_Acc(3, 11) Mul_Acc(4, 10) Mul_Acc(5, 9) Mul_Acc(6, 8) Mul_Acc(7, 7) Mul_Acc(8, 6) Mul_Acc(9, 5) Mul_Acc(10, 4) Mul_Acc(11, 3) Mul_Acc(12, 2) Mul_Acc(13, 1) Mul_Acc(14, 0) \
1004  Bot_SaveAcc(14, 0, 15) Bot_Acc(1, 14) Bot_Acc(2, 13) Bot_Acc(3, 12) Bot_Acc(4, 11) Bot_Acc(5, 10) Bot_Acc(6, 9) Bot_Acc(7, 8) Bot_Acc(8, 7) Bot_Acc(9, 6) Bot_Acc(10, 5) Bot_Acc(11, 4) Bot_Acc(12, 3) Bot_Acc(13, 2) Bot_Acc(14, 1) Bot_Acc(15, 0) \
1005  Bot_End(16)
1006 
1007 #endif
1008 
1009 #if 0
1010 #define Mul_Begin(n) \
1011  Declare2Words(p) \
1012  Declare2Words(c) \
1013  Declare2Words(d) \
1014  MultiplyWords(p, A[0], B[0]) \
1015  AssignWord(c, LowWord(p)) \
1016  AssignWord(d, HighWord(p))
1017 
1018 #define Mul_Acc(i, j) \
1019  MultiplyWords(p, A[i], B[j]) \
1020  Acc2WordsBy1(c, LowWord(p)) \
1021  Acc2WordsBy1(d, HighWord(p))
1022 
1023 #define Mul_SaveAcc(k, i, j) \
1024  R[k] = LowWord(c); \
1025  Add2WordsBy1(c, d, HighWord(c)) \
1026  MultiplyWords(p, A[i], B[j]) \
1027  AssignWord(d, HighWord(p)) \
1028  Acc2WordsBy1(c, LowWord(p))
1029 
1030 #define Mul_End(n) \
1031  R[2*n-3] = LowWord(c); \
1032  Acc2WordsBy1(d, HighWord(c)) \
1033  MultiplyWords(p, A[n-1], B[n-1])\
1034  Acc2WordsBy2(d, p) \
1035  R[2*n-2] = LowWord(d); \
1036  R[2*n-1] = HighWord(d);
1037 
1038 #define Bot_SaveAcc(k, i, j) \
1039  R[k] = LowWord(c); \
1040  word e = LowWord(d) + HighWord(c); \
1041  e += A[i] * B[j];
1042 
1043 #define Bot_Acc(i, j) \
1044  e += A[i] * B[j];
1045 
1046 #define Bot_End(n) \
1047  R[n-1] = e;
1048 #else
1049 #define Mul_Begin(n) \
1050  Declare2Words(p) \
1051  word c; \
1052  Declare2Words(d) \
1053  MultiplyWords(p, A[0], B[0]) \
1054  c = LowWord(p); \
1055  AssignWord(d, HighWord(p))
1056 
1057 #define Mul_Acc(i, j) \
1058  MulAcc(c, d, A[i], B[j])
1059 
1060 #define Mul_SaveAcc(k, i, j) \
1061  R[k] = c; \
1062  c = LowWord(d); \
1063  AssignWord(d, HighWord(d)) \
1064  MulAcc(c, d, A[i], B[j])
1065 
1066 #define Mul_End(k, i) \
1067  R[k] = c; \
1068  MultiplyWords(p, A[i], B[i]) \
1069  Acc2WordsBy2(p, d) \
1070  R[k+1] = LowWord(p); \
1071  R[k+2] = HighWord(p);
1072 
1073 #define Bot_SaveAcc(k, i, j) \
1074  R[k] = c; \
1075  c = LowWord(d); \
1076  c += A[i] * B[j];
1077 
1078 #define Bot_Acc(i, j) \
1079  c += A[i] * B[j];
1080 
1081 #define Bot_End(n) \
1082  R[n-1] = c;
1083 #endif
1084 
1085 #define Squ_Begin(n) \
1086  Declare2Words(p) \
1087  word c; \
1088  Declare2Words(d) \
1089  Declare2Words(e) \
1090  MultiplyWords(p, A[0], A[0]) \
1091  R[0] = LowWord(p); \
1092  AssignWord(e, HighWord(p)) \
1093  MultiplyWords(p, A[0], A[1]) \
1094  c = LowWord(p); \
1095  AssignWord(d, HighWord(p)) \
1096  Squ_NonDiag \
1097 
1098 #define Squ_NonDiag \
1099  Double3Words(c, d)
1100 
1101 #define Squ_SaveAcc(k, i, j) \
1102  Acc3WordsBy2(c, d, e) \
1103  R[k] = c; \
1104  MultiplyWords(p, A[i], A[j]) \
1105  c = LowWord(p); \
1106  AssignWord(d, HighWord(p)) \
1107 
1108 #define Squ_Acc(i, j) \
1109  MulAcc(c, d, A[i], A[j])
1110 
1111 #define Squ_Diag(i) \
1112  Squ_NonDiag \
1113  MulAcc(c, d, A[i], A[i])
1114 
1115 #define Squ_End(n) \
1116  Acc3WordsBy2(c, d, e) \
1117  R[2*n-3] = c; \
1118  MultiplyWords(p, A[n-1], A[n-1])\
1119  Acc2WordsBy2(p, e) \
1120  R[2*n-2] = LowWord(p); \
1121  R[2*n-1] = HighWord(p);
1122 
1123 void Baseline_Multiply2(word *R, const word *A, const word *B)
1124 {
1125  Mul_2
1126 }
1127 
1128 void Baseline_Multiply4(word *R, const word *A, const word *B)
1129 {
1130  Mul_4
1131 }
1132 
1133 void Baseline_Multiply8(word *R, const word *A, const word *B)
1134 {
1135  Mul_8
1136 }
1137 
1138 void Baseline_Square2(word *R, const word *A)
1139 {
1140  Squ_2
1141 }
1142 
1143 void Baseline_Square4(word *R, const word *A)
1144 {
1145  Squ_4
1146 }
1147 
1148 void Baseline_Square8(word *R, const word *A)
1149 {
1150  Squ_8
1151 }
1152 
1153 void Baseline_MultiplyBottom2(word *R, const word *A, const word *B)
1154 {
1155  Bot_2
1156 }
1157 
1158 void Baseline_MultiplyBottom4(word *R, const word *A, const word *B)
1159 {
1160  Bot_4
1161 }
1162 
1163 void Baseline_MultiplyBottom8(word *R, const word *A, const word *B)
1164 {
1165  Bot_8
1166 }
1167 
1168 #define Top_Begin(n) \
1169  Declare2Words(p) \
1170  word c; \
1171  Declare2Words(d) \
1172  MultiplyWords(p, A[0], B[n-2]);\
1173  AssignWord(d, HighWord(p));
1174 
1175 #define Top_Acc(i, j) \
1176  MultiplyWords(p, A[i], B[j]);\
1177  Acc2WordsBy1(d, HighWord(p));
1178 
1179 #define Top_SaveAcc0(i, j) \
1180  c = LowWord(d); \
1181  AssignWord(d, HighWord(d)) \
1182  MulAcc(c, d, A[i], B[j])
1183 
1184 #define Top_SaveAcc1(i, j) \
1185  c = L<c; \
1186  Acc2WordsBy1(d, c); \
1187  c = LowWord(d); \
1188  AssignWord(d, HighWord(d)) \
1189  MulAcc(c, d, A[i], B[j])
1190 
1191 void Baseline_MultiplyTop2(word *R, const word *A, const word *B, word L)
1192 {
1193  word T[4];
1194  Baseline_Multiply2(T, A, B);
1195  R[0] = T[2];
1196  R[1] = T[3];
1197 }
1198 
1199 void Baseline_MultiplyTop4(word *R, const word *A, const word *B, word L)
1200 {
1201  Top_Begin(4)
1202  Top_Acc(1, 1) Top_Acc(2, 0) \
1203  Top_SaveAcc0(0, 3) Mul_Acc(1, 2) Mul_Acc(2, 1) Mul_Acc(3, 0) \
1204  Top_SaveAcc1(1, 3) Mul_Acc(2, 2) Mul_Acc(3, 1) \
1205  Mul_SaveAcc(0, 2, 3) Mul_Acc(3, 2) \
1206  Mul_End(1, 3)
1207 }
1208 
1209 void Baseline_MultiplyTop8(word *R, const word *A, const word *B, word L)
1210 {
1211  Top_Begin(8)
1212  Top_Acc(1, 5) Top_Acc(2, 4) Top_Acc(3, 3) Top_Acc(4, 2) Top_Acc(5, 1) Top_Acc(6, 0) \
1213  Top_SaveAcc0(0, 7) Mul_Acc(1, 6) Mul_Acc(2, 5) Mul_Acc(3, 4) Mul_Acc(4, 3) Mul_Acc(5, 2) Mul_Acc(6, 1) Mul_Acc(7, 0) \
1214  Top_SaveAcc1(1, 7) Mul_Acc(2, 6) Mul_Acc(3, 5) Mul_Acc(4, 4) Mul_Acc(5, 3) Mul_Acc(6, 2) Mul_Acc(7, 1) \
1215  Mul_SaveAcc(0, 2, 7) Mul_Acc(3, 6) Mul_Acc(4, 5) Mul_Acc(5, 4) Mul_Acc(6, 3) Mul_Acc(7, 2) \
1216  Mul_SaveAcc(1, 3, 7) Mul_Acc(4, 6) Mul_Acc(5, 5) Mul_Acc(6, 4) Mul_Acc(7, 3) \
1217  Mul_SaveAcc(2, 4, 7) Mul_Acc(5, 6) Mul_Acc(6, 5) Mul_Acc(7, 4) \
1218  Mul_SaveAcc(3, 5, 7) Mul_Acc(6, 6) Mul_Acc(7, 5) \
1219  Mul_SaveAcc(4, 6, 7) Mul_Acc(7, 6) \
1220  Mul_End(5, 7)
1221 }
1222 
1223 #if !CRYPTOPP_INTEGER_SSE2 // save memory by not compiling these functions when SSE2 is available
1224 void Baseline_Multiply16(word *R, const word *A, const word *B)
1225 {
1226  Mul_16
1227 }
1228 
1229 void Baseline_Square16(word *R, const word *A)
1230 {
1231  Squ_16
1232 }
1233 
1234 void Baseline_MultiplyBottom16(word *R, const word *A, const word *B)
1235 {
1236  Bot_16
1237 }
1238 
1239 void Baseline_MultiplyTop16(word *R, const word *A, const word *B, word L)
1240 {
1241  Top_Begin(16)
1242  Top_Acc(1, 13) Top_Acc(2, 12) Top_Acc(3, 11) Top_Acc(4, 10) Top_Acc(5, 9) Top_Acc(6, 8) Top_Acc(7, 7) Top_Acc(8, 6) Top_Acc(9, 5) Top_Acc(10, 4) Top_Acc(11, 3) Top_Acc(12, 2) Top_Acc(13, 1) Top_Acc(14, 0) \
1243  Top_SaveAcc0(0, 15) Mul_Acc(1, 14) Mul_Acc(2, 13) Mul_Acc(3, 12) Mul_Acc(4, 11) Mul_Acc(5, 10) Mul_Acc(6, 9) Mul_Acc(7, 8) Mul_Acc(8, 7) Mul_Acc(9, 6) Mul_Acc(10, 5) Mul_Acc(11, 4) Mul_Acc(12, 3) Mul_Acc(13, 2) Mul_Acc(14, 1) Mul_Acc(15, 0) \
1244  Top_SaveAcc1(1, 15) Mul_Acc(2, 14) Mul_Acc(3, 13) Mul_Acc(4, 12) Mul_Acc(5, 11) Mul_Acc(6, 10) Mul_Acc(7, 9) Mul_Acc(8, 8) Mul_Acc(9, 7) Mul_Acc(10, 6) Mul_Acc(11, 5) Mul_Acc(12, 4) Mul_Acc(13, 3) Mul_Acc(14, 2) Mul_Acc(15, 1) \
1245  Mul_SaveAcc(0, 2, 15) Mul_Acc(3, 14) Mul_Acc(4, 13) Mul_Acc(5, 12) Mul_Acc(6, 11) Mul_Acc(7, 10) Mul_Acc(8, 9) Mul_Acc(9, 8) Mul_Acc(10, 7) Mul_Acc(11, 6) Mul_Acc(12, 5) Mul_Acc(13, 4) Mul_Acc(14, 3) Mul_Acc(15, 2) \
1246  Mul_SaveAcc(1, 3, 15) Mul_Acc(4, 14) Mul_Acc(5, 13) Mul_Acc(6, 12) Mul_Acc(7, 11) Mul_Acc(8, 10) Mul_Acc(9, 9) Mul_Acc(10, 8) Mul_Acc(11, 7) Mul_Acc(12, 6) Mul_Acc(13, 5) Mul_Acc(14, 4) Mul_Acc(15, 3) \
1247  Mul_SaveAcc(2, 4, 15) Mul_Acc(5, 14) Mul_Acc(6, 13) Mul_Acc(7, 12) Mul_Acc(8, 11) Mul_Acc(9, 10) Mul_Acc(10, 9) Mul_Acc(11, 8) Mul_Acc(12, 7) Mul_Acc(13, 6) Mul_Acc(14, 5) Mul_Acc(15, 4) \
1248  Mul_SaveAcc(3, 5, 15) Mul_Acc(6, 14) Mul_Acc(7, 13) Mul_Acc(8, 12) Mul_Acc(9, 11) Mul_Acc(10, 10) Mul_Acc(11, 9) Mul_Acc(12, 8) Mul_Acc(13, 7) Mul_Acc(14, 6) Mul_Acc(15, 5) \
1249  Mul_SaveAcc(4, 6, 15) Mul_Acc(7, 14) Mul_Acc(8, 13) Mul_Acc(9, 12) Mul_Acc(10, 11) Mul_Acc(11, 10) Mul_Acc(12, 9) Mul_Acc(13, 8) Mul_Acc(14, 7) Mul_Acc(15, 6) \
1250  Mul_SaveAcc(5, 7, 15) Mul_Acc(8, 14) Mul_Acc(9, 13) Mul_Acc(10, 12) Mul_Acc(11, 11) Mul_Acc(12, 10) Mul_Acc(13, 9) Mul_Acc(14, 8) Mul_Acc(15, 7) \
1251  Mul_SaveAcc(6, 8, 15) Mul_Acc(9, 14) Mul_Acc(10, 13) Mul_Acc(11, 12) Mul_Acc(12, 11) Mul_Acc(13, 10) Mul_Acc(14, 9) Mul_Acc(15, 8) \
1252  Mul_SaveAcc(7, 9, 15) Mul_Acc(10, 14) Mul_Acc(11, 13) Mul_Acc(12, 12) Mul_Acc(13, 11) Mul_Acc(14, 10) Mul_Acc(15, 9) \
1253  Mul_SaveAcc(8, 10, 15) Mul_Acc(11, 14) Mul_Acc(12, 13) Mul_Acc(13, 12) Mul_Acc(14, 11) Mul_Acc(15, 10) \
1254  Mul_SaveAcc(9, 11, 15) Mul_Acc(12, 14) Mul_Acc(13, 13) Mul_Acc(14, 12) Mul_Acc(15, 11) \
1255  Mul_SaveAcc(10, 12, 15) Mul_Acc(13, 14) Mul_Acc(14, 13) Mul_Acc(15, 12) \
1256  Mul_SaveAcc(11, 13, 15) Mul_Acc(14, 14) Mul_Acc(15, 13) \
1257  Mul_SaveAcc(12, 14, 15) Mul_Acc(15, 14) \
1258  Mul_End(13, 15)
1259 }
1260 #endif
1261 
1262 // ********************************************************
1263 
1264 #if CRYPTOPP_INTEGER_SSE2
1265 
1266 CRYPTOPP_ALIGN_DATA(16) static const word32 s_maskLow16[4] CRYPTOPP_SECTION_ALIGN16 = {0xffff,0xffff,0xffff,0xffff};
1267 
1268 #undef Mul_Begin
1269 #undef Mul_Acc
1270 #undef Top_Begin
1271 #undef Top_Acc
1272 #undef Squ_Acc
1273 #undef Squ_NonDiag
1274 #undef Squ_Diag
1275 #undef Squ_SaveAcc
1276 #undef Squ_Begin
1277 #undef Mul_SaveAcc
1278 #undef Bot_Acc
1279 #undef Bot_SaveAcc
1280 #undef Bot_End
1281 #undef Squ_End
1282 #undef Mul_End
1283 
1284 #define SSE2_FinalSave(k) \
1285  AS2( psllq xmm5, 16) \
1286  AS2( paddq xmm4, xmm5) \
1287  AS2( movq QWORD PTR [ecx+8*(k)], xmm4)
1288 
1289 #define SSE2_SaveShift(k) \
1290  AS2( movq xmm0, xmm6) \
1291  AS2( punpckhqdq xmm6, xmm0) \
1292  AS2( movq xmm1, xmm7) \
1293  AS2( punpckhqdq xmm7, xmm1) \
1294  AS2( paddd xmm6, xmm0) \
1295  AS2( pslldq xmm6, 4) \
1296  AS2( paddd xmm7, xmm1) \
1297  AS2( paddd xmm4, xmm6) \
1298  AS2( pslldq xmm7, 4) \
1299  AS2( movq xmm6, xmm4) \
1300  AS2( paddd xmm5, xmm7) \
1301  AS2( movq xmm7, xmm5) \
1302  AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
1303  AS2( psrlq xmm6, 16) \
1304  AS2( paddq xmm6, xmm7) \
1305  AS2( punpckhqdq xmm4, xmm0) \
1306  AS2( punpckhqdq xmm5, xmm0) \
1307  AS2( movq QWORD PTR [ecx+8*(k)+2], xmm6) \
1308  AS2( psrlq xmm6, 3*16) \
1309  AS2( paddd xmm4, xmm6) \
1310 
1311 #define Squ_SSE2_SaveShift(k) \
1312  AS2( movq xmm0, xmm6) \
1313  AS2( punpckhqdq xmm6, xmm0) \
1314  AS2( movq xmm1, xmm7) \
1315  AS2( punpckhqdq xmm7, xmm1) \
1316  AS2( paddd xmm6, xmm0) \
1317  AS2( pslldq xmm6, 4) \
1318  AS2( paddd xmm7, xmm1) \
1319  AS2( paddd xmm4, xmm6) \
1320  AS2( pslldq xmm7, 4) \
1321  AS2( movhlps xmm6, xmm4) \
1322  AS2( movd DWORD PTR [ecx+8*(k)], xmm4) \
1323  AS2( paddd xmm5, xmm7) \
1324  AS2( movhps QWORD PTR [esp+12], xmm5)\
1325  AS2( psrlq xmm4, 16) \
1326  AS2( paddq xmm4, xmm5) \
1327  AS2( movq QWORD PTR [ecx+8*(k)+2], xmm4) \
1328  AS2( psrlq xmm4, 3*16) \
1329  AS2( paddd xmm4, xmm6) \
1330  AS2( movq QWORD PTR [esp+4], xmm4)\
1331 
1332 #define SSE2_FirstMultiply(i) \
1333  AS2( movdqa xmm7, [esi+(i)*16])\
1334  AS2( movdqa xmm5, [edi-(i)*16])\
1335  AS2( pmuludq xmm5, xmm7) \
1336  AS2( movdqa xmm4, [ebx])\
1337  AS2( movdqa xmm6, xmm4) \
1338  AS2( pand xmm4, xmm5) \
1339  AS2( psrld xmm5, 16) \
1340  AS2( pmuludq xmm7, [edx-(i)*16])\
1341  AS2( pand xmm6, xmm7) \
1342  AS2( psrld xmm7, 16)
1343 
1344 #define Squ_Begin(n) \
1345  SquPrologue \
1346  AS2( mov esi, esp)\
1347  AS2( and esp, 0xfffffff0)\
1348  AS2( lea edi, [esp-32*n])\
1349  AS2( sub esp, 32*n+16)\
1350  AS1( push esi)\
1351  AS2( mov esi, edi) \
1352  AS2( xor edx, edx) \
1353  ASL(1) \
1354  ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1355  ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1356  AS2( movdqa [edi+2*edx], xmm0) \
1357  AS2( psrlq xmm0, 32) \
1358  AS2( movdqa [edi+2*edx+16], xmm0) \
1359  AS2( movdqa [edi+16*n+2*edx], xmm1) \
1360  AS2( psrlq xmm1, 32) \
1361  AS2( movdqa [edi+16*n+2*edx+16], xmm1) \
1362  AS2( add edx, 16) \
1363  AS2( cmp edx, 8*(n)) \
1364  ASJ( jne, 1, b) \
1365  AS2( lea edx, [edi+16*n])\
1366  SSE2_FirstMultiply(0) \
1367 
1368 #define Squ_Acc(i) \
1369  ASL(LSqu##i) \
1370  AS2( movdqa xmm1, [esi+(i)*16]) \
1371  AS2( movdqa xmm0, [edi-(i)*16]) \
1372  AS2( movdqa xmm2, [ebx]) \
1373  AS2( pmuludq xmm0, xmm1) \
1374  AS2( pmuludq xmm1, [edx-(i)*16]) \
1375  AS2( movdqa xmm3, xmm2) \
1376  AS2( pand xmm2, xmm0) \
1377  AS2( psrld xmm0, 16) \
1378  AS2( paddd xmm4, xmm2) \
1379  AS2( paddd xmm5, xmm0) \
1380  AS2( pand xmm3, xmm1) \
1381  AS2( psrld xmm1, 16) \
1382  AS2( paddd xmm6, xmm3) \
1383  AS2( paddd xmm7, xmm1) \
1384 
1385 #define Squ_Acc1(i)
1386 #define Squ_Acc2(i) ASC(call, LSqu##i)
1387 #define Squ_Acc3(i) Squ_Acc2(i)
1388 #define Squ_Acc4(i) Squ_Acc2(i)
1389 #define Squ_Acc5(i) Squ_Acc2(i)
1390 #define Squ_Acc6(i) Squ_Acc2(i)
1391 #define Squ_Acc7(i) Squ_Acc2(i)
1392 #define Squ_Acc8(i) Squ_Acc2(i)
1393 
1394 #define SSE2_End(E, n) \
1395  SSE2_SaveShift(2*(n)-3) \
1396  AS2( movdqa xmm7, [esi+16]) \
1397  AS2( movdqa xmm0, [edi]) \
1398  AS2( pmuludq xmm0, xmm7) \
1399  AS2( movdqa xmm2, [ebx]) \
1400  AS2( pmuludq xmm7, [edx]) \
1401  AS2( movdqa xmm6, xmm2) \
1402  AS2( pand xmm2, xmm0) \
1403  AS2( psrld xmm0, 16) \
1404  AS2( paddd xmm4, xmm2) \
1405  AS2( paddd xmm5, xmm0) \
1406  AS2( pand xmm6, xmm7) \
1407  AS2( psrld xmm7, 16) \
1408  SSE2_SaveShift(2*(n)-2) \
1409  SSE2_FinalSave(2*(n)-1) \
1410  AS1( pop esp)\
1411  E
1412 
1413 #define Squ_End(n) SSE2_End(SquEpilogue, n)
1414 #define Mul_End(n) SSE2_End(MulEpilogue, n)
1415 #define Top_End(n) SSE2_End(TopEpilogue, n)
1416 
1417 #define Squ_Column1(k, i) \
1418  Squ_SSE2_SaveShift(k) \
1419  AS2( add esi, 16) \
1420  SSE2_FirstMultiply(1)\
1421  Squ_Acc##i(i) \
1422  AS2( paddd xmm4, xmm4) \
1423  AS2( paddd xmm5, xmm5) \
1424  AS2( movdqa xmm3, [esi]) \
1425  AS2( movq xmm1, QWORD PTR [esi+8]) \
1426  AS2( pmuludq xmm1, xmm3) \
1427  AS2( pmuludq xmm3, xmm3) \
1428  AS2( movdqa xmm0, [ebx])\
1429  AS2( movdqa xmm2, xmm0) \
1430  AS2( pand xmm0, xmm1) \
1431  AS2( psrld xmm1, 16) \
1432  AS2( paddd xmm6, xmm0) \
1433  AS2( paddd xmm7, xmm1) \
1434  AS2( pand xmm2, xmm3) \
1435  AS2( psrld xmm3, 16) \
1436  AS2( paddd xmm6, xmm6) \
1437  AS2( paddd xmm7, xmm7) \
1438  AS2( paddd xmm4, xmm2) \
1439  AS2( paddd xmm5, xmm3) \
1440  AS2( movq xmm0, QWORD PTR [esp+4])\
1441  AS2( movq xmm1, QWORD PTR [esp+12])\
1442  AS2( paddd xmm4, xmm0)\
1443  AS2( paddd xmm5, xmm1)\
1444 
1445 #define Squ_Column0(k, i) \
1446  Squ_SSE2_SaveShift(k) \
1447  AS2( add edi, 16) \
1448  AS2( add edx, 16) \
1449  SSE2_FirstMultiply(1)\
1450  Squ_Acc##i(i) \
1451  AS2( paddd xmm6, xmm6) \
1452  AS2( paddd xmm7, xmm7) \
1453  AS2( paddd xmm4, xmm4) \
1454  AS2( paddd xmm5, xmm5) \
1455  AS2( movq xmm0, QWORD PTR [esp+4])\
1456  AS2( movq xmm1, QWORD PTR [esp+12])\
1457  AS2( paddd xmm4, xmm0)\
1458  AS2( paddd xmm5, xmm1)\
1459 
1460 #define SSE2_MulAdd45 \
1461  AS2( movdqa xmm7, [esi]) \
1462  AS2( movdqa xmm0, [edi]) \
1463  AS2( pmuludq xmm0, xmm7) \
1464  AS2( movdqa xmm2, [ebx]) \
1465  AS2( pmuludq xmm7, [edx]) \
1466  AS2( movdqa xmm6, xmm2) \
1467  AS2( pand xmm2, xmm0) \
1468  AS2( psrld xmm0, 16) \
1469  AS2( paddd xmm4, xmm2) \
1470  AS2( paddd xmm5, xmm0) \
1471  AS2( pand xmm6, xmm7) \
1472  AS2( psrld xmm7, 16)
1473 
1474 #define Mul_Begin(n) \
1475  MulPrologue \
1476  AS2( mov esi, esp)\
1477  AS2( and esp, 0xfffffff0)\
1478  AS2( sub esp, 48*n+16)\
1479  AS1( push esi)\
1480  AS2( xor edx, edx) \
1481  ASL(1) \
1482  ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1483  ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1484  ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
1485  AS2( movdqa [esp+20+2*edx], xmm0) \
1486  AS2( psrlq xmm0, 32) \
1487  AS2( movdqa [esp+20+2*edx+16], xmm0) \
1488  AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
1489  AS2( psrlq xmm1, 32) \
1490  AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
1491  AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
1492  AS2( psrlq xmm2, 32) \
1493  AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
1494  AS2( add edx, 16) \
1495  AS2( cmp edx, 8*(n)) \
1496  ASJ( jne, 1, b) \
1497  AS2( lea edi, [esp+20])\
1498  AS2( lea edx, [esp+20+16*n])\
1499  AS2( lea esi, [esp+20+32*n])\
1500  SSE2_FirstMultiply(0) \
1501 
1502 #define Mul_Acc(i) \
1503  ASL(LMul##i) \
1504  AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
1505  AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
1506  AS2( movdqa xmm2, [ebx]) \
1507  AS2( pmuludq xmm0, xmm1) \
1508  AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
1509  AS2( movdqa xmm3, xmm2) \
1510  AS2( pand xmm2, xmm0) \
1511  AS2( psrld xmm0, 16) \
1512  AS2( paddd xmm4, xmm2) \
1513  AS2( paddd xmm5, xmm0) \
1514  AS2( pand xmm3, xmm1) \
1515  AS2( psrld xmm1, 16) \
1516  AS2( paddd xmm6, xmm3) \
1517  AS2( paddd xmm7, xmm1) \
1518 
1519 #define Mul_Acc1(i)
1520 #define Mul_Acc2(i) ASC(call, LMul##i)
1521 #define Mul_Acc3(i) Mul_Acc2(i)
1522 #define Mul_Acc4(i) Mul_Acc2(i)
1523 #define Mul_Acc5(i) Mul_Acc2(i)
1524 #define Mul_Acc6(i) Mul_Acc2(i)
1525 #define Mul_Acc7(i) Mul_Acc2(i)
1526 #define Mul_Acc8(i) Mul_Acc2(i)
1527 #define Mul_Acc9(i) Mul_Acc2(i)
1528 #define Mul_Acc10(i) Mul_Acc2(i)
1529 #define Mul_Acc11(i) Mul_Acc2(i)
1530 #define Mul_Acc12(i) Mul_Acc2(i)
1531 #define Mul_Acc13(i) Mul_Acc2(i)
1532 #define Mul_Acc14(i) Mul_Acc2(i)
1533 #define Mul_Acc15(i) Mul_Acc2(i)
1534 #define Mul_Acc16(i) Mul_Acc2(i)
1535 
1536 #define Mul_Column1(k, i) \
1537  SSE2_SaveShift(k) \
1538  AS2( add esi, 16) \
1539  SSE2_MulAdd45\
1540  Mul_Acc##i(i) \
1541 
1542 #define Mul_Column0(k, i) \
1543  SSE2_SaveShift(k) \
1544  AS2( add edi, 16) \
1545  AS2( add edx, 16) \
1546  SSE2_MulAdd45\
1547  Mul_Acc##i(i) \
1548 
1549 #define Bot_Acc(i) \
1550  AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \
1551  AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \
1552  AS2( pmuludq xmm0, xmm1) \
1553  AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
1554  AS2( paddq xmm4, xmm0) \
1555  AS2( paddd xmm6, xmm1)
1556 
1557 #define Bot_SaveAcc(k) \
1558  SSE2_SaveShift(k) \
1559  AS2( add edi, 16) \
1560  AS2( add edx, 16) \
1561  AS2( movdqa xmm6, [esi]) \
1562  AS2( movdqa xmm0, [edi]) \
1563  AS2( pmuludq xmm0, xmm6) \
1564  AS2( paddq xmm4, xmm0) \
1565  AS2( psllq xmm5, 16) \
1566  AS2( paddq xmm4, xmm5) \
1567  AS2( pmuludq xmm6, [edx])
1568 
1569 #define Bot_End(n) \
1570  AS2( movhlps xmm7, xmm6) \
1571  AS2( paddd xmm6, xmm7) \
1572  AS2( psllq xmm6, 32) \
1573  AS2( paddd xmm4, xmm6) \
1574  AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \
1575  AS1( pop esp)\
1576  MulEpilogue
1577 
1578 #define Top_Begin(n) \
1579  TopPrologue \
1580  AS2( mov edx, esp)\
1581  AS2( and esp, 0xfffffff0)\
1582  AS2( sub esp, 48*n+16)\
1583  AS1( push edx)\
1584  AS2( xor edx, edx) \
1585  ASL(1) \
1586  ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
1587  ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
1588  ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
1589  AS2( movdqa [esp+20+2*edx], xmm0) \
1590  AS2( psrlq xmm0, 32) \
1591  AS2( movdqa [esp+20+2*edx+16], xmm0) \
1592  AS2( movdqa [esp+20+16*n+2*edx], xmm1) \
1593  AS2( psrlq xmm1, 32) \
1594  AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \
1595  AS2( movdqa [esp+20+32*n+2*edx], xmm2) \
1596  AS2( psrlq xmm2, 32) \
1597  AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \
1598  AS2( add edx, 16) \
1599  AS2( cmp edx, 8*(n)) \
1600  ASJ( jne, 1, b) \
1601  AS2( mov eax, esi) \
1602  AS2( lea edi, [esp+20+00*n+16*(n/2-1)])\
1603  AS2( lea edx, [esp+20+16*n+16*(n/2-1)])\
1604  AS2( lea esi, [esp+20+32*n+16*(n/2-1)])\
1605  AS2( pxor xmm4, xmm4)\
1606  AS2( pxor xmm5, xmm5)
1607 
1608 #define Top_Acc(i) \
1609  AS2( movq xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8]) \
1610  AS2( pmuludq xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \
1611  AS2( psrlq xmm0, 48) \
1612  AS2( paddd xmm5, xmm0)\
1613 
1614 #define Top_Column0(i) \
1615  AS2( psllq xmm5, 32) \
1616  AS2( add edi, 16) \
1617  AS2( add edx, 16) \
1618  SSE2_MulAdd45\
1619  Mul_Acc##i(i) \
1620 
1621 #define Top_Column1(i) \
1622  SSE2_SaveShift(0) \
1623  AS2( add esi, 16) \
1624  SSE2_MulAdd45\
1625  Mul_Acc##i(i) \
1626  AS2( shr eax, 16) \
1627  AS2( movd xmm0, eax)\
1628  AS2( movd xmm1, [ecx+4])\
1629  AS2( psrld xmm1, 16)\
1630  AS2( pcmpgtd xmm1, xmm0)\
1631  AS2( psrld xmm1, 31)\
1632  AS2( paddd xmm4, xmm1)\
1633 
1634 void SSE2_Square4(word *C, const word *A)
1635 {
1636  Squ_Begin(2)
1637  Squ_Column0(0, 1)
1638  Squ_End(2)
1639 }
1640 
1641 void SSE2_Square8(word *C, const word *A)
1642 {
1643  Squ_Begin(4)
1644 #ifndef __GNUC__
1645  ASJ( jmp, 0, f)
1646  Squ_Acc(2)
1647  AS1( ret) ASL(0)
1648 #endif
1649  Squ_Column0(0, 1)
1650  Squ_Column1(1, 1)
1651  Squ_Column0(2, 2)
1652  Squ_Column1(3, 1)
1653  Squ_Column0(4, 1)
1654  Squ_End(4)
1655 }
1656 
1657 void SSE2_Square16(word *C, const word *A)
1658 {
1659  Squ_Begin(8)
1660 #ifndef __GNUC__
1661  ASJ( jmp, 0, f)
1662  Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1663  AS1( ret) ASL(0)
1664 #endif
1665  Squ_Column0(0, 1)
1666  Squ_Column1(1, 1)
1667  Squ_Column0(2, 2)
1668  Squ_Column1(3, 2)
1669  Squ_Column0(4, 3)
1670  Squ_Column1(5, 3)
1671  Squ_Column0(6, 4)
1672  Squ_Column1(7, 3)
1673  Squ_Column0(8, 3)
1674  Squ_Column1(9, 2)
1675  Squ_Column0(10, 2)
1676  Squ_Column1(11, 1)
1677  Squ_Column0(12, 1)
1678  Squ_End(8)
1679 }
1680 
1681 void SSE2_Square32(word *C, const word *A)
1682 {
1683  Squ_Begin(16)
1684  ASJ( jmp, 0, f)
1685  Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
1686  AS1( ret) ASL(0)
1687  Squ_Column0(0, 1)
1688  Squ_Column1(1, 1)
1689  Squ_Column0(2, 2)
1690  Squ_Column1(3, 2)
1691  Squ_Column0(4, 3)
1692  Squ_Column1(5, 3)
1693  Squ_Column0(6, 4)
1694  Squ_Column1(7, 4)
1695  Squ_Column0(8, 5)
1696  Squ_Column1(9, 5)
1697  Squ_Column0(10, 6)
1698  Squ_Column1(11, 6)
1699  Squ_Column0(12, 7)
1700  Squ_Column1(13, 7)
1701  Squ_Column0(14, 8)
1702  Squ_Column1(15, 7)
1703  Squ_Column0(16, 7)
1704  Squ_Column1(17, 6)
1705  Squ_Column0(18, 6)
1706  Squ_Column1(19, 5)
1707  Squ_Column0(20, 5)
1708  Squ_Column1(21, 4)
1709  Squ_Column0(22, 4)
1710  Squ_Column1(23, 3)
1711  Squ_Column0(24, 3)
1712  Squ_Column1(25, 2)
1713  Squ_Column0(26, 2)
1714  Squ_Column1(27, 1)
1715  Squ_Column0(28, 1)
1716  Squ_End(16)
1717 }
1718 
1719 void SSE2_Multiply4(word *C, const word *A, const word *B)
1720 {
1721  Mul_Begin(2)
1722 #ifndef __GNUC__
1723  ASJ( jmp, 0, f)
1724  Mul_Acc(2)
1725  AS1( ret) ASL(0)
1726 #endif
1727  Mul_Column0(0, 2)
1728  Mul_End(2)
1729 }
1730 
1731 void SSE2_Multiply8(word *C, const word *A, const word *B)
1732 {
1733  Mul_Begin(4)
1734 #ifndef __GNUC__
1735  ASJ( jmp, 0, f)
1736  Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1737  AS1( ret) ASL(0)
1738 #endif
1739  Mul_Column0(0, 2)
1740  Mul_Column1(1, 3)
1741  Mul_Column0(2, 4)
1742  Mul_Column1(3, 3)
1743  Mul_Column0(4, 2)
1744  Mul_End(4)
1745 }
1746 
1747 void SSE2_Multiply16(word *C, const word *A, const word *B)
1748 {
1749  Mul_Begin(8)
1750 #ifndef __GNUC__
1751  ASJ( jmp, 0, f)
1752  Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1753  AS1( ret) ASL(0)
1754 #endif
1755  Mul_Column0(0, 2)
1756  Mul_Column1(1, 3)
1757  Mul_Column0(2, 4)
1758  Mul_Column1(3, 5)
1759  Mul_Column0(4, 6)
1760  Mul_Column1(5, 7)
1761  Mul_Column0(6, 8)
1762  Mul_Column1(7, 7)
1763  Mul_Column0(8, 6)
1764  Mul_Column1(9, 5)
1765  Mul_Column0(10, 4)
1766  Mul_Column1(11, 3)
1767  Mul_Column0(12, 2)
1768  Mul_End(8)
1769 }
1770 
1771 void SSE2_Multiply32(word *C, const word *A, const word *B)
1772 {
1773  Mul_Begin(16)
1774  ASJ( jmp, 0, f)
1775  Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1776  AS1( ret) ASL(0)
1777  Mul_Column0(0, 2)
1778  Mul_Column1(1, 3)
1779  Mul_Column0(2, 4)
1780  Mul_Column1(3, 5)
1781  Mul_Column0(4, 6)
1782  Mul_Column1(5, 7)
1783  Mul_Column0(6, 8)
1784  Mul_Column1(7, 9)
1785  Mul_Column0(8, 10)
1786  Mul_Column1(9, 11)
1787  Mul_Column0(10, 12)
1788  Mul_Column1(11, 13)
1789  Mul_Column0(12, 14)
1790  Mul_Column1(13, 15)
1791  Mul_Column0(14, 16)
1792  Mul_Column1(15, 15)
1793  Mul_Column0(16, 14)
1794  Mul_Column1(17, 13)
1795  Mul_Column0(18, 12)
1796  Mul_Column1(19, 11)
1797  Mul_Column0(20, 10)
1798  Mul_Column1(21, 9)
1799  Mul_Column0(22, 8)
1800  Mul_Column1(23, 7)
1801  Mul_Column0(24, 6)
1802  Mul_Column1(25, 5)
1803  Mul_Column0(26, 4)
1804  Mul_Column1(27, 3)
1805  Mul_Column0(28, 2)
1806  Mul_End(16)
1807 }
1808 
1809 void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
1810 {
1811  Mul_Begin(2)
1812  Bot_SaveAcc(0) Bot_Acc(2)
1813  Bot_End(2)
1814 }
1815 
1816 void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
1817 {
1818  Mul_Begin(4)
1819 #ifndef __GNUC__
1820  ASJ( jmp, 0, f)
1821  Mul_Acc(3) Mul_Acc(2)
1822  AS1( ret) ASL(0)
1823 #endif
1824  Mul_Column0(0, 2)
1825  Mul_Column1(1, 3)
1826  Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1827  Bot_End(4)
1828 }
1829 
1830 void SSE2_MultiplyBottom16(word *C, const word *A, const word *B)
1831 {
1832  Mul_Begin(8)
1833 #ifndef __GNUC__
1834  ASJ( jmp, 0, f)
1835  Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1836  AS1( ret) ASL(0)
1837 #endif
1838  Mul_Column0(0, 2)
1839  Mul_Column1(1, 3)
1840  Mul_Column0(2, 4)
1841  Mul_Column1(3, 5)
1842  Mul_Column0(4, 6)
1843  Mul_Column1(5, 7)
1844  Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1845  Bot_End(8)
1846 }
1847 
1848 void SSE2_MultiplyBottom32(word *C, const word *A, const word *B)
1849 {
1850  Mul_Begin(16)
1851 #ifndef __GNUC__
1852  ASJ( jmp, 0, f)
1853  Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1854  AS1( ret) ASL(0)
1855 #endif
1856  Mul_Column0(0, 2)
1857  Mul_Column1(1, 3)
1858  Mul_Column0(2, 4)
1859  Mul_Column1(3, 5)
1860  Mul_Column0(4, 6)
1861  Mul_Column1(5, 7)
1862  Mul_Column0(6, 8)
1863  Mul_Column1(7, 9)
1864  Mul_Column0(8, 10)
1865  Mul_Column1(9, 11)
1866  Mul_Column0(10, 12)
1867  Mul_Column1(11, 13)
1868  Mul_Column0(12, 14)
1869  Mul_Column1(13, 15)
1870  Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
1871  Bot_End(16)
1872 }
1873 
1874 void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L)
1875 {
1876  Top_Begin(4)
1877  Top_Acc(3) Top_Acc(2) Top_Acc(1)
1878 #ifndef __GNUC__
1879  ASJ( jmp, 0, f)
1880  Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1881  AS1( ret) ASL(0)
1882 #endif
1883  Top_Column0(4)
1884  Top_Column1(3)
1885  Mul_Column0(0, 2)
1886  Top_End(2)
1887 }
1888 
1889 void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L)
1890 {
1891  Top_Begin(8)
1892  Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1893 #ifndef __GNUC__
1894  ASJ( jmp, 0, f)
1895  Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1896  AS1( ret) ASL(0)
1897 #endif
1898  Top_Column0(8)
1899  Top_Column1(7)
1900  Mul_Column0(0, 6)
1901  Mul_Column1(1, 5)
1902  Mul_Column0(2, 4)
1903  Mul_Column1(3, 3)
1904  Mul_Column0(4, 2)
1905  Top_End(4)
1906 }
1907 
1908 void SSE2_MultiplyTop32(word *C, const word *A, const word *B, word L)
1909 {
1910  Top_Begin(16)
1911  Top_Acc(15) Top_Acc(14) Top_Acc(13) Top_Acc(12) Top_Acc(11) Top_Acc(10) Top_Acc(9) Top_Acc(8) Top_Acc(7) Top_Acc(6) Top_Acc(5) Top_Acc(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)
1912 #ifndef __GNUC__
1913  ASJ( jmp, 0, f)
1914  Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
1915  AS1( ret) ASL(0)
1916 #endif
1917  Top_Column0(16)
1918  Top_Column1(15)
1919  Mul_Column0(0, 14)
1920  Mul_Column1(1, 13)
1921  Mul_Column0(2, 12)
1922  Mul_Column1(3, 11)
1923  Mul_Column0(4, 10)
1924  Mul_Column1(5, 9)
1925  Mul_Column0(6, 8)
1926  Mul_Column1(7, 7)
1927  Mul_Column0(8, 6)
1928  Mul_Column1(9, 5)
1929  Mul_Column0(10, 4)
1930  Mul_Column1(11, 3)
1931  Mul_Column0(12, 2)
1932  Top_End(8)
1933 }
1934 
1935 #endif // #if CRYPTOPP_INTEGER_SSE2
1936 
1937 // ********************************************************
1938 
1939 typedef int (CRYPTOPP_FASTCALL * PAdd)(size_t N, word *C, const word *A, const word *B);
1940 typedef void (* PMul)(word *C, const word *A, const word *B);
1941 typedef void (* PSqu)(word *C, const word *A);
1942 typedef void (* PMulTop)(word *C, const word *A, const word *B, word L);
1943 
1944 #if CRYPTOPP_INTEGER_SSE2
1945 static PAdd s_pAdd = &Baseline_Add, s_pSub = &Baseline_Sub;
1946 static size_t s_recursionLimit = 8;
1947 #else
1948 static const size_t s_recursionLimit = 16;
1949 #endif
1950 
1951 static PMul s_pMul[9], s_pBot[9];
1952 static PSqu s_pSqu[9];
1953 static PMulTop s_pTop[9];
1954 
1955 static void SetFunctionPointers()
1956 {
1957  s_pMul[0] = &Baseline_Multiply2;
1958  s_pBot[0] = &Baseline_MultiplyBottom2;
1959  s_pSqu[0] = &Baseline_Square2;
1960  s_pTop[0] = &Baseline_MultiplyTop2;
1961  s_pTop[1] = &Baseline_MultiplyTop4;
1962 
1963 #if CRYPTOPP_INTEGER_SSE2
1964  if (HasSSE2())
1965  {
1966 #if _MSC_VER != 1200 || defined(NDEBUG)
1967  if (IsP4())
1968  {
1969  s_pAdd = &SSE2_Add;
1970  s_pSub = &SSE2_Sub;
1971  }
1972 #endif
1973 
1974  s_recursionLimit = 32;
1975 
1976  s_pMul[1] = &SSE2_Multiply4;
1977  s_pMul[2] = &SSE2_Multiply8;
1978  s_pMul[4] = &SSE2_Multiply16;
1979  s_pMul[8] = &SSE2_Multiply32;
1980 
1981  s_pBot[1] = &SSE2_MultiplyBottom4;
1982  s_pBot[2] = &SSE2_MultiplyBottom8;
1983  s_pBot[4] = &SSE2_MultiplyBottom16;
1984  s_pBot[8] = &SSE2_MultiplyBottom32;
1985 
1986  s_pSqu[1] = &SSE2_Square4;
1987  s_pSqu[2] = &SSE2_Square8;
1988  s_pSqu[4] = &SSE2_Square16;
1989  s_pSqu[8] = &SSE2_Square32;
1990 
1991  s_pTop[2] = &SSE2_MultiplyTop8;
1992  s_pTop[4] = &SSE2_MultiplyTop16;
1993  s_pTop[8] = &SSE2_MultiplyTop32;
1994  }
1995  else
1996 #endif
1997  {
1998  s_pMul[1] = &Baseline_Multiply4;
1999  s_pMul[2] = &Baseline_Multiply8;
2000 
2001  s_pBot[1] = &Baseline_MultiplyBottom4;
2002  s_pBot[2] = &Baseline_MultiplyBottom8;
2003 
2004  s_pSqu[1] = &Baseline_Square4;
2005  s_pSqu[2] = &Baseline_Square8;
2006 
2007  s_pTop[2] = &Baseline_MultiplyTop8;
2008 
2009 #if !CRYPTOPP_INTEGER_SSE2
2010  s_pMul[4] = &Baseline_Multiply16;
2011  s_pBot[4] = &Baseline_MultiplyBottom16;
2012  s_pSqu[4] = &Baseline_Square16;
2013  s_pTop[4] = &Baseline_MultiplyTop16;
2014 #endif
2015  }
2016 }
2017 
2018 inline int Add(word *C, const word *A, const word *B, size_t N)
2019 {
2020 #if CRYPTOPP_INTEGER_SSE2
2021  return s_pAdd(N, C, A, B);
2022 #else
2023  return Baseline_Add(N, C, A, B);
2024 #endif
2025 }
2026 
2027 inline int Subtract(word *C, const word *A, const word *B, size_t N)
2028 {
2029 #if CRYPTOPP_INTEGER_SSE2
2030  return s_pSub(N, C, A, B);
2031 #else
2032  return Baseline_Sub(N, C, A, B);
2033 #endif
2034 }
2035 
2036 // ********************************************************
2037 
2038 
2039 #define A0 A
2040 #define A1 (A+N2)
2041 #define B0 B
2042 #define B1 (B+N2)
2043 
2044 #define T0 T
2045 #define T1 (T+N2)
2046 #define T2 (T+N)
2047 #define T3 (T+N+N2)
2048 
2049 #define R0 R
2050 #define R1 (R+N2)
2051 #define R2 (R+N)
2052 #define R3 (R+N+N2)
2053 
2054 // R[2*N] - result = A*B
2055 // T[2*N] - temporary work space
2056 // A[N] --- multiplier
2057 // B[N] --- multiplicant
2058 
2059 void RecursiveMultiply(word *R, word *T, const word *A, const word *B, size_t N)
2060 {
2061  assert(N>=2 && N%2==0);
2062 
2063  if (N <= s_recursionLimit)
2064  s_pMul[N/4](R, A, B);
2065  else
2066  {
2067  const size_t N2 = N/2;
2068 
2069  size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2;
2070  Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
2071 
2072  size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2;
2073  Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
2074 
2075  RecursiveMultiply(R2, T2, A1, B1, N2);
2076  RecursiveMultiply(T0, T2, R0, R1, N2);
2077  RecursiveMultiply(R0, T2, A0, B0, N2);
2078 
2079  // now T[01] holds (A1-A0)*(B0-B1), R[01] holds A0*B0, R[23] holds A1*B1
2080 
2081  int c2 = Add(R2, R2, R1, N2);
2082  int c3 = c2;
2083  c2 += Add(R1, R2, R0, N2);
2084  c3 += Add(R2, R2, R3, N2);
2085 
2086  if (AN2 == BN2)
2087  c3 -= Subtract(R1, R1, T0, N);
2088  else
2089  c3 += Add(R1, R1, T0, N);
2090 
2091  c3 += Increment(R2, N2, c2);
2092  assert (c3 >= 0 && c3 <= 2);
2093  Increment(R3, N2, c3);
2094  }
2095 }
2096 
2097 // R[2*N] - result = A*A
2098 // T[2*N] - temporary work space
2099 // A[N] --- number to be squared
2100 
2101 void RecursiveSquare(word *R, word *T, const word *A, size_t N)
2102 {
2103  assert(N && N%2==0);
2104 
2105  if (N <= s_recursionLimit)
2106  s_pSqu[N/4](R, A);
2107  else
2108  {
2109  const size_t N2 = N/2;
2110 
2111  RecursiveSquare(R0, T2, A0, N2);
2112  RecursiveSquare(R2, T2, A1, N2);
2113  RecursiveMultiply(T0, T2, A0, A1, N2);
2114 
2115  int carry = Add(R1, R1, T0, N);
2116  carry += Add(R1, R1, T0, N);
2117  Increment(R3, N2, carry);
2118  }
2119 }
2120 
2121 // R[N] - bottom half of A*B
2122 // T[3*N/2] - temporary work space
2123 // A[N] - multiplier
2124 // B[N] - multiplicant
2125 
2126 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
2127 {
2128  assert(N>=2 && N%2==0);
2129 
2130  if (N <= s_recursionLimit)
2131  s_pBot[N/4](R, A, B);
2132  else
2133  {
2134  const size_t N2 = N/2;
2135 
2136  RecursiveMultiply(R, T, A0, B0, N2);
2137  RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
2138  Add(R1, R1, T0, N2);
2139  RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
2140  Add(R1, R1, T0, N2);
2141  }
2142 }
2143 
2144 // R[N] --- upper half of A*B
2145 // T[2*N] - temporary work space
2146 // L[N] --- lower half of A*B
2147 // A[N] --- multiplier
2148 // B[N] --- multiplicant
2149 
2150 void MultiplyTop(word *R, word *T, const word *L, const word *A, const word *B, size_t N)
2151 {
2152  assert(N>=2 && N%2==0);
2153 
2154  if (N <= s_recursionLimit)
2155  s_pTop[N/4](R, A, B, L[N-1]);
2156  else
2157  {
2158  const size_t N2 = N/2;
2159 
2160  size_t AN2 = Compare(A0, A1, N2) > 0 ? 0 : N2;
2161  Subtract(R0, A + AN2, A + (N2 ^ AN2), N2);
2162 
2163  size_t BN2 = Compare(B0, B1, N2) > 0 ? 0 : N2;
2164  Subtract(R1, B + BN2, B + (N2 ^ BN2), N2);
2165 
2166  RecursiveMultiply(T0, T2, R0, R1, N2);
2167  RecursiveMultiply(R0, T2, A1, B1, N2);
2168 
2169  // now T[01] holds (A1-A0)*(B0-B1) = A1*B0+A0*B1-A1*B1-A0*B0, R[01] holds A1*B1
2170 
2171  int t, c3;
2172  int c2 = Subtract(T2, L+N2, L, N2);
2173 
2174  if (AN2 == BN2)
2175  {
2176  c2 -= Add(T2, T2, T0, N2);
2177  t = (Compare(T2, R0, N2) == -1);
2178  c3 = t - Subtract(T2, T2, T1, N2);
2179  }
2180  else
2181  {
2182  c2 += Subtract(T2, T2, T0, N2);
2183  t = (Compare(T2, R0, N2) == -1);
2184  c3 = t + Add(T2, T2, T1, N2);
2185  }
2186 
2187  c2 += t;
2188  if (c2 >= 0)
2189  c3 += Increment(T2, N2, c2);
2190  else
2191  c3 -= Decrement(T2, N2, -c2);
2192  c3 += Add(R0, T2, R1, N2);
2193 
2194  assert (c3 >= 0 && c3 <= 2);
2195  Increment(R1, N2, c3);
2196  }
2197 }
2198 
2199 inline void Multiply(word *R, word *T, const word *A, const word *B, size_t N)
2200 {
2201  RecursiveMultiply(R, T, A, B, N);
2202 }
2203 
2204 inline void Square(word *R, word *T, const word *A, size_t N)
2205 {
2206  RecursiveSquare(R, T, A, N);
2207 }
2208 
2209 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, size_t N)
2210 {
2211  RecursiveMultiplyBottom(R, T, A, B, N);
2212 }
2213 
2214 // R[NA+NB] - result = A*B
2215 // T[NA+NB] - temporary work space
2216 // A[NA] ---- multiplier
2217 // B[NB] ---- multiplicant
2218 
2219 void AsymmetricMultiply(word *R, word *T, const word *A, size_t NA, const word *B, size_t NB)
2220 {
2221  if (NA == NB)
2222  {
2223  if (A == B)
2224  Square(R, T, A, NA);
2225  else
2226  Multiply(R, T, A, B, NA);
2227 
2228  return;
2229  }
2230 
2231  if (NA > NB)
2232  {
2233  std::swap(A, B);
2234  std::swap(NA, NB);
2235  }
2236 
2237  assert(NB % NA == 0);
2238 
2239  if (NA==2 && !A[1])
2240  {
2241  switch (A[0])
2242  {
2243  case 0:
2244  SetWords(R, 0, NB+2);
2245  return;
2246  case 1:
2247  CopyWords(R, B, NB);
2248  R[NB] = R[NB+1] = 0;
2249  return;
2250  default:
2251  R[NB] = LinearMultiply(R, B, A[0], NB);
2252  R[NB+1] = 0;
2253  return;
2254  }
2255  }
2256 
2257  size_t i;
2258  if ((NB/NA)%2 == 0)
2259  {
2260  Multiply(R, T, A, B, NA);
2261  CopyWords(T+2*NA, R+NA, NA);
2262 
2263  for (i=2*NA; i<NB; i+=2*NA)
2264  Multiply(T+NA+i, T, A, B+i, NA);
2265  for (i=NA; i<NB; i+=2*NA)
2266  Multiply(R+i, T, A, B+i, NA);
2267  }
2268  else
2269  {
2270  for (i=0; i<NB; i+=2*NA)
2271  Multiply(R+i, T, A, B+i, NA);
2272  for (i=NA; i<NB; i+=2*NA)
2273  Multiply(T+NA+i, T, A, B+i, NA);
2274  }
2275 
2276  if (Add(R+NA, R+NA, T+2*NA, NB-NA))
2277  Increment(R+NB, NA);
2278 }
2279 
2280 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
2281 // T[3*N/2] - temporary work space
2282 // A[N] ----- an odd number as input
2283 
2284 void RecursiveInverseModPower2(word *R, word *T, const word *A, size_t N)
2285 {
2286  if (N==2)
2287  {
2288  T[0] = AtomicInverseModPower2(A[0]);
2289  T[1] = 0;
2290  s_pBot[0](T+2, T, A);
2291  TwosComplement(T+2, 2);
2292  Increment(T+2, 2, 2);
2293  s_pBot[0](R, T, T+2);
2294  }
2295  else
2296  {
2297  const size_t N2 = N/2;
2298  RecursiveInverseModPower2(R0, T0, A0, N2);
2299  T0[0] = 1;
2300  SetWords(T0+1, 0, N2-1);
2301  MultiplyTop(R1, T1, T0, R0, A0, N2);
2302  MultiplyBottom(T0, T1, R0, A1, N2);
2303  Add(T0, R1, T0, N2);
2304  TwosComplement(T0, N2);
2305  MultiplyBottom(R1, T1, R0, T0, N2);
2306  }
2307 }
2308 
2309 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M
2310 // T[3*N] - temporary work space
2311 // X[2*N] - number to be reduced
2312 // M[N] --- modulus
2313 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
2314 
2315 void MontgomeryReduce(word *R, word *T, word *X, const word *M, const word *U, size_t N)
2316 {
2317 #if 1
2318  MultiplyBottom(R, T, X, U, N);
2319  MultiplyTop(T, T+N, X, R, M, N);
2320  word borrow = Subtract(T, X+N, T, N);
2321  // defend against timing attack by doing this Add even when not needed
2322  word carry = Add(T+N, T, M, N);
2323  assert(carry | !borrow);
2324  CopyWords(R, T + ((0-borrow) & N), N);
2325 #elif 0
2326  const word u = 0-U[0];
2327  Declare2Words(p)
2328  for (size_t i=0; i<N; i++)
2329  {
2330  const word t = u * X[i];
2331  word c = 0;
2332  for (size_t j=0; j<N; j+=2)
2333  {
2334  MultiplyWords(p, t, M[j]);
2335  Acc2WordsBy1(p, X[i+j]);
2336  Acc2WordsBy1(p, c);
2337  X[i+j] = LowWord(p);
2338  c = HighWord(p);
2339  MultiplyWords(p, t, M[j+1]);
2340  Acc2WordsBy1(p, X[i+j+1]);
2341  Acc2WordsBy1(p, c);
2342  X[i+j+1] = LowWord(p);
2343  c = HighWord(p);
2344  }
2345 
2346  if (Increment(X+N+i, N-i, c))
2347  while (!Subtract(X+N, X+N, M, N)) {}
2348  }
2349 
2350  memcpy(R, X+N, N*WORD_SIZE);
2351 #else
2352  __m64 u = _mm_cvtsi32_si64(0-U[0]), p;
2353  for (size_t i=0; i<N; i++)
2354  {
2355  __m64 t = _mm_cvtsi32_si64(X[i]);
2356  t = _mm_mul_su32(t, u);
2357  __m64 c = _mm_setzero_si64();
2358  for (size_t j=0; j<N; j+=2)
2359  {
2360  p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j]));
2361  p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j]));
2362  c = _mm_add_si64(c, p);
2363  X[i+j] = _mm_cvtsi64_si32(c);
2364  c = _mm_srli_si64(c, 32);
2365  p = _mm_mul_su32(t, _mm_cvtsi32_si64(M[j+1]));
2366  p = _mm_add_si64(p, _mm_cvtsi32_si64(X[i+j+1]));
2367  c = _mm_add_si64(c, p);
2368  X[i+j+1] = _mm_cvtsi64_si32(c);
2369  c = _mm_srli_si64(c, 32);
2370  }
2371 
2372  if (Increment(X+N+i, N-i, _mm_cvtsi64_si32(c)))
2373  while (!Subtract(X+N, X+N, M, N)) {}
2374  }
2375 
2376  memcpy(R, X+N, N*WORD_SIZE);
2377  _mm_empty();
2378 #endif
2379 }
2380 
2381 // R[N] --- result = X/(2**(WORD_BITS*N/2)) mod M
2382 // T[2*N] - temporary work space
2383 // X[2*N] - number to be reduced
2384 // M[N] --- modulus
2385 // U[N/2] - multiplicative inverse of M mod 2**(WORD_BITS*N/2)
2386 // V[N] --- 2**(WORD_BITS*3*N/2) mod M
2387 
2388 void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const word *U, const word *V, size_t N)
2389 {
2390  assert(N%2==0 && N>=4);
2391 
2392 #define M0 M
2393 #define M1 (M+N2)
2394 #define V0 V
2395 #define V1 (V+N2)
2396 
2397 #define X0 X
2398 #define X1 (X+N2)
2399 #define X2 (X+N)
2400 #define X3 (X+N+N2)
2401 
2402  const size_t N2 = N/2;
2403  Multiply(T0, T2, V0, X3, N2);
2404  int c2 = Add(T0, T0, X0, N);
2405  MultiplyBottom(T3, T2, T0, U, N2);
2406  MultiplyTop(T2, R, T0, T3, M0, N2);
2407  c2 -= Subtract(T2, T1, T2, N2);
2408  Multiply(T0, R, T3, M1, N2);
2409  c2 -= Subtract(T0, T2, T0, N2);
2410  int c3 = -(int)Subtract(T1, X2, T1, N2);
2411  Multiply(R0, T2, V1, X3, N2);
2412  c3 += Add(R, R, T, N);
2413 
2414  if (c2>0)
2415  c3 += Increment(R1, N2);
2416  else if (c2<0)
2417  c3 -= Decrement(R1, N2, -c2);
2418 
2419  assert(c3>=-1 && c3<=1);
2420  if (c3>0)
2421  Subtract(R, R, M, N);
2422  else if (c3<0)
2423  Add(R, R, M, N);
2424 
2425 #undef M0
2426 #undef M1
2427 #undef V0
2428 #undef V1
2429 
2430 #undef X0
2431 #undef X1
2432 #undef X2
2433 #undef X3
2434 }
2435 
2436 #undef A0
2437 #undef A1
2438 #undef B0
2439 #undef B1
2440 
2441 #undef T0
2442 #undef T1
2443 #undef T2
2444 #undef T3
2445 
2446 #undef R0
2447 #undef R1
2448 #undef R2
2449 #undef R3
2450 
2451 /*
2452 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
2453 static word SubatomicDivide(word *A, word B0, word B1)
2454 {
2455  // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
2456  assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
2457 
2458  // estimate the quotient: do a 2 word by 1 word divide
2459  word Q;
2460  if (B1+1 == 0)
2461  Q = A[2];
2462  else
2463  Q = DWord(A[1], A[2]).DividedBy(B1+1);
2464 
2465  // now subtract Q*B from A
2466  DWord p = DWord::Multiply(B0, Q);
2467  DWord u = (DWord) A[0] - p.GetLowHalf();
2468  A[0] = u.GetLowHalf();
2469  u = (DWord) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - DWord::Multiply(B1, Q);
2470  A[1] = u.GetLowHalf();
2471  A[2] += u.GetHighHalf();
2472 
2473  // Q <= actual quotient, so fix it
2474  while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
2475  {
2476  u = (DWord) A[0] - B0;
2477  A[0] = u.GetLowHalf();
2478  u = (DWord) A[1] - B1 - u.GetHighHalfAsBorrow();
2479  A[1] = u.GetLowHalf();
2480  A[2] += u.GetHighHalf();
2481  Q++;
2482  assert(Q); // shouldn't overflow
2483  }
2484 
2485  return Q;
2486 }
2487 
2488 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
2489 static inline void AtomicDivide(word *Q, const word *A, const word *B)
2490 {
2491  if (!B[0] && !B[1]) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
2492  {
2493  Q[0] = A[2];
2494  Q[1] = A[3];
2495  }
2496  else
2497  {
2498  word T[4];
2499  T[0] = A[0]; T[1] = A[1]; T[2] = A[2]; T[3] = A[3];
2500  Q[1] = SubatomicDivide(T+1, B[0], B[1]);
2501  Q[0] = SubatomicDivide(T, B[0], B[1]);
2502 
2503 #ifndef NDEBUG
2504  // multiply quotient and divisor and add remainder, make sure it equals dividend
2505  assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2506  word P[4];
2507  LowLevel::Multiply2(P, Q, B);
2508  Add(P, P, T, 4);
2509  assert(memcmp(P, A, 4*WORD_SIZE)==0);
2510 #endif
2511  }
2512 }
2513 */
2514 
2515 static inline void AtomicDivide(word *Q, const word *A, const word *B)
2516 {
2517  word T[4];
2518  DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
2519  Q[0] = q.GetLowHalf();
2520  Q[1] = q.GetHighHalf();
2521 
2522 #ifndef NDEBUG
2523  if (B[0] || B[1])
2524  {
2525  // multiply quotient and divisor and add remainder, make sure it equals dividend
2526  assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
2527  word P[4];
2528  s_pMul[0](P, Q, B);
2529  Add(P, P, T, 4);
2530  assert(memcmp(P, A, 4*WORD_SIZE)==0);
2531  }
2532 #endif
2533 }
2534 
2535 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
2536 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, size_t N)
2537 {
2538  assert(N && N%2==0);
2539 
2540  AsymmetricMultiply(T, T+N+2, Q, 2, B, N);
2541 
2542  word borrow = Subtract(R, R, T, N+2);
2543  assert(!borrow && !R[N+1]);
2544 
2545  while (R[N] || Compare(R, B, N) >= 0)
2546  {
2547  R[N] -= Subtract(R, R, B, N);
2548  Q[1] += (++Q[0]==0);
2549  assert(Q[0] || Q[1]); // no overflow
2550  }
2551 }
2552 
2553 // R[NB] -------- remainder = A%B
2554 // Q[NA-NB+2] --- quotient = A/B
2555 // T[NA+3*(NB+2)] - temp work space
2556 // A[NA] -------- dividend
2557 // B[NB] -------- divisor
2558 
2559 void Divide(word *R, word *Q, word *T, const word *A, size_t NA, const word *B, size_t NB)
2560 {
2561  assert(NA && NB && NA%2==0 && NB%2==0);
2562  assert(B[NB-1] || B[NB-2]);
2563  assert(NB <= NA);
2564 
2565  // set up temporary work space
2566  word *const TA=T;
2567  word *const TB=T+NA+2;
2568  word *const TP=T+NA+2+NB;
2569 
2570  // copy B into TB and normalize it so that TB has highest bit set to 1
2571  unsigned shiftWords = (B[NB-1]==0);
2572  TB[0] = TB[NB-1] = 0;
2573  CopyWords(TB+shiftWords, B, NB-shiftWords);
2574  unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
2575  assert(shiftBits < WORD_BITS);
2576  ShiftWordsLeftByBits(TB, NB, shiftBits);
2577 
2578  // copy A into TA and normalize it
2579  TA[0] = TA[NA] = TA[NA+1] = 0;
2580  CopyWords(TA+shiftWords, A, NA);
2581  ShiftWordsLeftByBits(TA, NA+2, shiftBits);
2582 
2583  if (TA[NA+1]==0 && TA[NA] <= 1)
2584  {
2585  Q[NA-NB+1] = Q[NA-NB] = 0;
2586  while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
2587  {
2588  TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
2589  ++Q[NA-NB];
2590  }
2591  }
2592  else
2593  {
2594  NA+=2;
2595  assert(Compare(TA+NA-NB, TB, NB) < 0);
2596  }
2597 
2598  word BT[2];
2599  BT[0] = TB[NB-2] + 1;
2600  BT[1] = TB[NB-1] + (BT[0]==0);
2601 
2602  // start reducing TA mod TB, 2 words at a time
2603  for (size_t i=NA-2; i>=NB; i-=2)
2604  {
2605  AtomicDivide(Q+i-NB, TA+i-2, BT);
2606  CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
2607  }
2608 
2609  // copy TA into R, and denormalize it
2610  CopyWords(R, TA+shiftWords, NB);
2611  ShiftWordsRightByBits(R, NB, shiftBits);
2612 }
2613 
2614 static inline size_t EvenWordCount(const word *X, size_t N)
2615 {
2616  while (N && X[N-2]==0 && X[N-1]==0)
2617  N-=2;
2618  return N;
2619 }
2620 
2621 // return k
2622 // R[N] --- result = A^(-1) * 2^k mod M
2623 // T[4*N] - temporary work space
2624 // A[NA] -- number to take inverse of
2625 // M[N] --- modulus
2626 
2627 unsigned int AlmostInverse(word *R, word *T, const word *A, size_t NA, const word *M, size_t N)
2628 {
2629  assert(NA<=N && N && N%2==0);
2630 
2631  word *b = T;
2632  word *c = T+N;
2633  word *f = T+2*N;
2634  word *g = T+3*N;
2635  size_t bcLen=2, fgLen=EvenWordCount(M, N);
2636  unsigned int k=0;
2637  bool s=false;
2638 
2639  SetWords(T, 0, 3*N);
2640  b[0]=1;
2641  CopyWords(f, A, NA);
2642  CopyWords(g, M, N);
2643 
2644  while (1)
2645  {
2646  word t=f[0];
2647  while (!t)
2648  {
2649  if (EvenWordCount(f, fgLen)==0)
2650  {
2651  SetWords(R, 0, N);
2652  return 0;
2653  }
2654 
2655  ShiftWordsRightByWords(f, fgLen, 1);
2656  bcLen += 2 * (c[bcLen-1] != 0);
2657  assert(bcLen <= N);
2658  ShiftWordsLeftByWords(c, bcLen, 1);
2659  k+=WORD_BITS;
2660  t=f[0];
2661  }
2662 
2663  unsigned int i = TrailingZeros(t);
2664  t >>= i;
2665  k += i;
2666 
2667  if (t==1 && f[1]==0 && EvenWordCount(f+2, fgLen-2)==0)
2668  {
2669  if (s)
2670  Subtract(R, M, b, N);
2671  else
2672  CopyWords(R, b, N);
2673  return k;
2674  }
2675 
2676  ShiftWordsRightByBits(f, fgLen, i);
2677  t = ShiftWordsLeftByBits(c, bcLen, i);
2678  c[bcLen] += t;
2679  bcLen += 2 * (t!=0);
2680  assert(bcLen <= N);
2681 
2682  bool swap = Compare(f, g, fgLen)==-1;
2683  ConditionalSwapPointers(swap, f, g);
2684  ConditionalSwapPointers(swap, b, c);
2685  s ^= swap;
2686 
2687  fgLen -= 2 * !(f[fgLen-2] | f[fgLen-1]);
2688 
2689  Subtract(f, f, g, fgLen);
2690  t = Add(b, b, c, bcLen);
2691  b[bcLen] += t;
2692  bcLen += 2*t;
2693  assert(bcLen <= N);
2694  }
2695 }
2696 
2697 // R[N] - result = A/(2^k) mod M
2698 // A[N] - input
2699 // M[N] - modulus
2700 
2701 void DivideByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
2702 {
2703  CopyWords(R, A, N);
2704 
2705  while (k--)
2706  {
2707  if (R[0]%2==0)
2708  ShiftWordsRightByBits(R, N, 1);
2709  else
2710  {
2711  word carry = Add(R, R, M, N);
2712  ShiftWordsRightByBits(R, N, 1);
2713  R[N-1] += carry<<(WORD_BITS-1);
2714  }
2715  }
2716 }
2717 
2718 // R[N] - result = A*(2^k) mod M
2719 // A[N] - input
2720 // M[N] - modulus
2721 
2722 void MultiplyByPower2Mod(word *R, const word *A, size_t k, const word *M, size_t N)
2723 {
2724  CopyWords(R, A, N);
2725 
2726  while (k--)
2727  if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
2728  Subtract(R, R, M, N);
2729 }
2730 
2731 // ******************************************************************
2732 
2733 InitializeInteger::InitializeInteger()
2734 {
2735  if (!g_pAssignIntToInteger)
2736  {
2737  SetFunctionPointers();
2738  g_pAssignIntToInteger = AssignIntToInteger;
2739  }
2740 }
2741 
2742 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
2743 
2744 static inline size_t RoundupSize(size_t n)
2745 {
2746  if (n<=8)
2747  return RoundupSizeTable[n];
2748  else if (n<=16)
2749  return 16;
2750  else if (n<=32)
2751  return 32;
2752  else if (n<=64)
2753  return 64;
2754  else return size_t(1) << BitPrecision(n-1);
2755 }
2756 
2758  : reg(2), sign(POSITIVE)
2759 {
2760  reg[0] = reg[1] = 0;
2761 }
2762 
2764  : reg(RoundupSize(t.WordCount())), sign(t.sign)
2765 {
2766  CopyWords(reg, t.reg, reg.size());
2767 }
2768 
2769 Integer::Integer(Sign s, lword value)
2770  : reg(2), sign(s)
2771 {
2772  reg[0] = word(value);
2773  reg[1] = word(SafeRightShift<WORD_BITS>(value));
2774 }
2775 
2776 Integer::Integer(signed long value)
2777  : reg(2)
2778 {
2779  if (value >= 0)
2780  sign = POSITIVE;
2781  else
2782  {
2783  sign = NEGATIVE;
2784  value = -value;
2785  }
2786  reg[0] = word(value);
2787  reg[1] = word(SafeRightShift<WORD_BITS>((unsigned long)value));
2788 }
2789 
2790 Integer::Integer(Sign s, word high, word low)
2791  : reg(2), sign(s)
2792 {
2793  reg[0] = low;
2794  reg[1] = high;
2795 }
2796 
2798 {
2799  if (ByteCount() > sizeof(long))
2800  return false;
2801 
2802  unsigned long value = (unsigned long)reg[0];
2803  value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
2804 
2805  if (sign==POSITIVE)
2806  return (signed long)value >= 0;
2807  else
2808  return -(signed long)value < 0;
2809 }
2810 
2811 signed long Integer::ConvertToLong() const
2812 {
2813  assert(IsConvertableToLong());
2814 
2815  unsigned long value = (unsigned long)reg[0];
2816  value += SafeLeftShift<WORD_BITS, unsigned long>((unsigned long)reg[1]);
2817  return sign==POSITIVE ? value : -(signed long)value;
2818 }
2819 
2820 Integer::Integer(BufferedTransformation &encodedInteger, size_t byteCount, Signedness s)
2821 {
2822  Decode(encodedInteger, byteCount, s);
2823 }
2824 
2825 Integer::Integer(const byte *encodedInteger, size_t byteCount, Signedness s)
2826 {
2827  Decode(encodedInteger, byteCount, s);
2828 }
2829 
2831 {
2832  BERDecode(bt);
2833 }
2834 
2836 {
2837  Randomize(rng, bitcount);
2838 }
2839 
2840 Integer::Integer(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
2841 {
2842  if (!Randomize(rng, min, max, rnType, equiv, mod))
2844 }
2845 
2847 {
2848  Integer r((word)0, BitsToWords(e+1));
2849  r.SetBit(e);
2850  return r;
2851 }
2852 
2853 template <long i>
2855 {
2856  Integer * operator()() const
2857  {
2858  return new Integer(i);
2859  }
2860 };
2861 
2863 {
2864  return Singleton<Integer>().Ref();
2865 }
2866 
2868 {
2869  return Singleton<Integer, NewInteger<1> >().Ref();
2870 }
2871 
2873 {
2874  return Singleton<Integer, NewInteger<2> >().Ref();
2875 }
2876 
2877 bool Integer::operator!() const
2878 {
2879  return IsNegative() ? false : (reg[0]==0 && WordCount()==0);
2880 }
2881 
2882 Integer& Integer::operator=(const Integer& t)
2883 {
2884  if (this != &t)
2885  {
2886  if (reg.size() != t.reg.size() || t.reg[t.reg.size()/2] == 0)
2887  reg.New(RoundupSize(t.WordCount()));
2888  CopyWords(reg, t.reg, reg.size());
2889  sign = t.sign;
2890  }
2891  return *this;
2892 }
2893 
2894 bool Integer::GetBit(size_t n) const
2895 {
2896  if (n/WORD_BITS >= reg.size())
2897  return 0;
2898  else
2899  return bool((reg[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
2900 }
2901 
2902 void Integer::SetBit(size_t n, bool value)
2903 {
2904  if (value)
2905  {
2906  reg.CleanGrow(RoundupSize(BitsToWords(n+1)));
2907  reg[n/WORD_BITS] |= (word(1) << (n%WORD_BITS));
2908  }
2909  else
2910  {
2911  if (n/WORD_BITS < reg.size())
2912  reg[n/WORD_BITS] &= ~(word(1) << (n%WORD_BITS));
2913  }
2914 }
2915 
2916 byte Integer::GetByte(size_t n) const
2917 {
2918  if (n/WORD_SIZE >= reg.size())
2919  return 0;
2920  else
2921  return byte(reg[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
2922 }
2923 
2924 void Integer::SetByte(size_t n, byte value)
2925 {
2926  reg.CleanGrow(RoundupSize(BytesToWords(n+1)));
2927  reg[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
2928  reg[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
2929 }
2930 
2931 lword Integer::GetBits(size_t i, size_t n) const
2932 {
2933  lword v = 0;
2934  assert(n <= sizeof(v)*8);
2935  for (unsigned int j=0; j<n; j++)
2936  v |= lword(GetBit(i+j)) << j;
2937  return v;
2938 }
2939 
2940 Integer Integer::operator-() const
2941 {
2942  Integer result(*this);
2943  result.Negate();
2944  return result;
2945 }
2946 
2947 Integer Integer::AbsoluteValue() const
2948 {
2949  Integer result(*this);
2950  result.sign = POSITIVE;
2951  return result;
2952 }
2953 
2954 void Integer::swap(Integer &a)
2955 {
2956  reg.swap(a.reg);
2957  std::swap(sign, a.sign);
2958 }
2959 
2960 Integer::Integer(word value, size_t length)
2961  : reg(RoundupSize(length)), sign(POSITIVE)
2962 {
2963  reg[0] = value;
2964  SetWords(reg+1, 0, reg.size()-1);
2965 }
2966 
2967 template <class T>
2968 static Integer StringToInteger(const T *str)
2969 {
2970  int radix;
2971  // GCC workaround
2972  // std::char_traits<wchar_t>::length() not defined in GCC 3.2 and STLport 4.5.3
2973  unsigned int length;
2974  for (length = 0; str[length] != 0; length++) {}
2975 
2976  Integer v;
2977 
2978  if (length == 0)
2979  return v;
2980 
2981  switch (str[length-1])
2982  {
2983  case 'h':
2984  case 'H':
2985  radix=16;
2986  break;
2987  case 'o':
2988  case 'O':
2989  radix=8;
2990  break;
2991  case 'b':
2992  case 'B':
2993  radix=2;
2994  break;
2995  default:
2996  radix=10;
2997  }
2998 
2999  if (length > 2 && str[0] == '0' && str[1] == 'x')
3000  radix = 16;
3001 
3002  for (unsigned i=0; i<length; i++)
3003  {
3004  int digit;
3005 
3006  if (str[i] >= '0' && str[i] <= '9')
3007  digit = str[i] - '0';
3008  else if (str[i] >= 'A' && str[i] <= 'F')
3009  digit = str[i] - 'A' + 10;
3010  else if (str[i] >= 'a' && str[i] <= 'f')
3011  digit = str[i] - 'a' + 10;
3012  else
3013  digit = radix;
3014 
3015  if (digit < radix)
3016  {
3017  v *= radix;
3018  v += digit;
3019  }
3020  }
3021 
3022  if (str[0] == '-')
3023  v.Negate();
3024 
3025  return v;
3026 }
3027 
3028 Integer::Integer(const char *str)
3029  : reg(2), sign(POSITIVE)
3030 {
3031  *this = StringToInteger(str);
3032 }
3033 
3034 Integer::Integer(const wchar_t *str)
3035  : reg(2), sign(POSITIVE)
3036 {
3037  *this = StringToInteger(str);
3038 }
3039 
3040 unsigned int Integer::WordCount() const
3041 {
3042  return (unsigned int)CountWords(reg, reg.size());
3043 }
3044 
3045 unsigned int Integer::ByteCount() const
3046 {
3047  unsigned wordCount = WordCount();
3048  if (wordCount)
3049  return (wordCount-1)*WORD_SIZE + BytePrecision(reg[wordCount-1]);
3050  else
3051  return 0;
3052 }
3053 
3054 unsigned int Integer::BitCount() const
3055 {
3056  unsigned wordCount = WordCount();
3057  if (wordCount)
3058  return (wordCount-1)*WORD_BITS + BitPrecision(reg[wordCount-1]);
3059  else
3060  return 0;
3061 }
3062 
3063 void Integer::Decode(const byte *input, size_t inputLen, Signedness s)
3064 {
3065  StringStore store(input, inputLen);
3066  Decode(store, inputLen, s);
3067 }
3068 
3069 void Integer::Decode(BufferedTransformation &bt, size_t inputLen, Signedness s)
3070 {
3071  assert(bt.MaxRetrievable() >= inputLen);
3072 
3073  byte b;
3074  bt.Peek(b);
3075  sign = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
3076 
3077  while (inputLen>0 && (sign==POSITIVE ? b==0 : b==0xff))
3078  {
3079  bt.Skip(1);
3080  inputLen--;
3081  bt.Peek(b);
3082  }
3083 
3084  reg.CleanNew(RoundupSize(BytesToWords(inputLen)));
3085 
3086  for (size_t i=inputLen; i > 0; i--)
3087  {
3088  bt.Get(b);
3089  reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8;
3090  }
3091 
3092  if (sign == NEGATIVE)
3093  {
3094  for (size_t i=inputLen; i<reg.size()*WORD_SIZE; i++)
3095  reg[i/WORD_SIZE] |= word(0xff) << (i%WORD_SIZE)*8;
3096  TwosComplement(reg, reg.size());
3097  }
3098 }
3099 
3100 size_t Integer::MinEncodedSize(Signedness signedness) const
3101 {
3102  unsigned int outputLen = STDMAX(1U, ByteCount());
3103  if (signedness == UNSIGNED)
3104  return outputLen;
3105  if (NotNegative() && (GetByte(outputLen-1) & 0x80))
3106  outputLen++;
3107  if (IsNegative() && *this < -Power2(outputLen*8-1))
3108  outputLen++;
3109  return outputLen;
3110 }
3111 
3112 void Integer::Encode(byte *output, size_t outputLen, Signedness signedness) const
3113 {
3114  ArraySink sink(output, outputLen);
3115  Encode(sink, outputLen, signedness);
3116 }
3117 
3118 void Integer::Encode(BufferedTransformation &bt, size_t outputLen, Signedness signedness) const
3119 {
3120  if (signedness == UNSIGNED || NotNegative())
3121  {
3122  for (size_t i=outputLen; i > 0; i--)
3123  bt.Put(GetByte(i-1));
3124  }
3125  else
3126  {
3127  // take two's complement of *this
3128  Integer temp = Integer::Power2(8*STDMAX((size_t)ByteCount(), outputLen)) + *this;
3129  temp.Encode(bt, outputLen, UNSIGNED);
3130  }
3131 }
3132 
3134 {
3135  DERGeneralEncoder enc(bt, INTEGER);
3136  Encode(enc, MinEncodedSize(SIGNED), SIGNED);
3137  enc.MessageEnd();
3138 }
3139 
3140 void Integer::BERDecode(const byte *input, size_t len)
3141 {
3142  StringStore store(input, len);
3143  BERDecode(store);
3144 }
3145 
3146 void Integer::BERDecode(BufferedTransformation &bt)
3147 {
3148  BERGeneralDecoder dec(bt, INTEGER);
3149  if (!dec.IsDefiniteLength() || dec.MaxRetrievable() < dec.RemainingLength())
3150  BERDecodeError();
3151  Decode(dec, (size_t)dec.RemainingLength(), SIGNED);
3152  dec.MessageEnd();
3153 }
3154 
3156 {
3157  DERGeneralEncoder enc(bt, OCTET_STRING);
3158  Encode(enc, length);
3159  enc.MessageEnd();
3160 }
3161 
3163 {
3164  BERGeneralDecoder dec(bt, OCTET_STRING);
3165  if (!dec.IsDefiniteLength() || dec.RemainingLength() != length)
3166  BERDecodeError();
3167  Decode(dec, length);
3168  dec.MessageEnd();
3169 }
3170 
3171 size_t Integer::OpenPGPEncode(byte *output, size_t len) const
3172 {
3173  ArraySink sink(output, len);
3174  return OpenPGPEncode(sink);
3175 }
3176 
3178 {
3179  word16 bitCount = BitCount();
3180  bt.PutWord16(bitCount);
3181  size_t byteCount = BitsToBytes(bitCount);
3182  Encode(bt, byteCount);
3183  return 2 + byteCount;
3184 }
3185 
3186 void Integer::OpenPGPDecode(const byte *input, size_t len)
3187 {
3188  StringStore store(input, len);
3189  OpenPGPDecode(store);
3190 }
3191 
3192 void Integer::OpenPGPDecode(BufferedTransformation &bt)
3193 {
3194  word16 bitCount;
3195  if (bt.GetWord16(bitCount) != 2 || bt.MaxRetrievable() < BitsToBytes(bitCount))
3196  throw OpenPGPDecodeErr();
3197  Decode(bt, BitsToBytes(bitCount));
3198 }
3199 
3200 void Integer::Randomize(RandomNumberGenerator &rng, size_t nbits)
3201 {
3202  const size_t nbytes = nbits/8 + 1;
3203  SecByteBlock buf(nbytes);
3204  rng.GenerateBlock(buf, nbytes);
3205  if (nbytes)
3206  buf[0] = (byte)Crop(buf[0], nbits % 8);
3207  Decode(buf, nbytes, UNSIGNED);
3208 }
3209 
3210 void Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max)
3211 {
3212  if (min > max)
3213  throw InvalidArgument("Integer: Min must be no greater than Max");
3214 
3215  Integer range = max - min;
3216  const unsigned int nbits = range.BitCount();
3217 
3218  do
3219  {
3220  Randomize(rng, nbits);
3221  }
3222  while (*this > range);
3223 
3224  *this += min;
3225 }
3226 
3227 bool Integer::Randomize(RandomNumberGenerator &rng, const Integer &min, const Integer &max, RandomNumberType rnType, const Integer &equiv, const Integer &mod)
3228 {
3229  return GenerateRandomNoThrow(rng, MakeParameters("Min", min)("Max", max)("RandomNumberType", rnType)("EquivalentTo", equiv)("Mod", mod));
3230 }
3231 
3233 {
3234 public:
3235  KDF2_RNG(const byte *seed, size_t seedSize)
3236  : m_counter(0), m_counterAndSeed(seedSize + 4)
3237  {
3238  memcpy(m_counterAndSeed + 4, seed, seedSize);
3239  }
3240 
3241  void GenerateBlock(byte *output, size_t size)
3242  {
3243  PutWord(false, BIG_ENDIAN_ORDER, m_counterAndSeed, m_counter);
3244  ++m_counter;
3245  P1363_KDF2<SHA1>::DeriveKey(output, size, m_counterAndSeed, m_counterAndSeed.size(), NULL, 0);
3246  }
3247 
3248 private:
3249  word32 m_counter;
3250  SecByteBlock m_counterAndSeed;
3251 };
3252 
3253 bool Integer::GenerateRandomNoThrow(RandomNumberGenerator &i_rng, const NameValuePairs &params)
3254 {
3255  Integer min = params.GetValueWithDefault("Min", Integer::Zero());
3256  Integer max;
3257  if (!params.GetValue("Max", max))
3258  {
3259  int bitLength;
3260  if (params.GetIntValue("BitLength", bitLength))
3261  max = Integer::Power2(bitLength);
3262  else
3263  throw InvalidArgument("Integer: missing Max argument");
3264  }
3265  if (min > max)
3266  throw InvalidArgument("Integer: Min must be no greater than Max");
3267 
3268  Integer equiv = params.GetValueWithDefault("EquivalentTo", Integer::Zero());
3269  Integer mod = params.GetValueWithDefault("Mod", Integer::One());
3270 
3271  if (equiv.IsNegative() || equiv >= mod)
3272  throw InvalidArgument("Integer: invalid EquivalentTo and/or Mod argument");
3273 
3274  Integer::RandomNumberType rnType = params.GetValueWithDefault("RandomNumberType", Integer::ANY);
3275 
3276  member_ptr<KDF2_RNG> kdf2Rng;
3278  if (params.GetValue(Name::Seed(), seed))
3279  {
3280  ByteQueue bq;
3281  DERSequenceEncoder seq(bq);
3282  min.DEREncode(seq);
3283  max.DEREncode(seq);
3284  equiv.DEREncode(seq);
3285  mod.DEREncode(seq);
3286  DEREncodeUnsigned(seq, rnType);
3287  DEREncodeOctetString(seq, seed.begin(), seed.size());
3288  seq.MessageEnd();
3289 
3290  SecByteBlock finalSeed((size_t)bq.MaxRetrievable());
3291  bq.Get(finalSeed, finalSeed.size());
3292  kdf2Rng.reset(new KDF2_RNG(finalSeed.begin(), finalSeed.size()));
3293  }
3294  RandomNumberGenerator &rng = kdf2Rng.get() ? (RandomNumberGenerator &)*kdf2Rng : i_rng;
3295 
3296  switch (rnType)
3297  {
3298  case ANY:
3299  if (mod == One())
3300  Randomize(rng, min, max);
3301  else
3302  {
3303  Integer min1 = min + (equiv-min)%mod;
3304  if (max < min1)
3305  return false;
3306  Randomize(rng, Zero(), (max - min1) / mod);
3307  *this *= mod;
3308  *this += min1;
3309  }
3310  return true;
3311 
3312  case PRIME:
3313  {
3314  const PrimeSelector *pSelector = params.GetValueWithDefault(Name::PointerToPrimeSelector(), (const PrimeSelector *)NULL);
3315 
3316  int i;
3317  i = 0;
3318  while (1)
3319  {
3320  if (++i==16)
3321  {
3322  // check if there are any suitable primes in [min, max]
3323  Integer first = min;
3324  if (FirstPrime(first, max, equiv, mod, pSelector))
3325  {
3326  // if there is only one suitable prime, we're done
3327  *this = first;
3328  if (!FirstPrime(first, max, equiv, mod, pSelector))
3329  return true;
3330  }
3331  else
3332  return false;
3333  }
3334 
3335  Randomize(rng, min, max);
3336  if (FirstPrime(*this, STDMIN(*this+mod*PrimeSearchInterval(max), max), equiv, mod, pSelector))
3337  return true;
3338  }
3339  }
3340 
3341  default:
3342  throw InvalidArgument("Integer: invalid RandomNumberType argument");
3343  }
3344 }
3345 
3346 std::istream& operator>>(std::istream& in, Integer &a)
3347 {
3348  char c;
3349  unsigned int length = 0;
3350  SecBlock<char> str(length + 16);
3351 
3352  std::ws(in);
3353 
3354  do
3355  {
3356  in.read(&c, 1);
3357  str[length++] = c;
3358  if (length >= str.size())
3359  str.Grow(length + 16);
3360  }
3361  while (in && (c=='-' || c=='x' || (c>='0' && c<='9') || (c>='a' && c<='f') || (c>='A' && c<='F') || c=='h' || c=='H' || c=='o' || c=='O' || c==',' || c=='.'));
3362 
3363  if (in.gcount())
3364  in.putback(c);
3365  str[length-1] = '\0';
3366  a = Integer(str);
3367 
3368  return in;
3369 }
3370 
3371 std::ostream& operator<<(std::ostream& out, const Integer &a)
3372 {
3373  // Get relevant conversion specifications from ostream.
3374  long f = out.flags() & std::ios::basefield; // Get base digits.
3375  int base, block;
3376  char suffix;
3377  switch(f)
3378  {
3379  case std::ios::oct :
3380  base = 8;
3381  block = 8;
3382  suffix = 'o';
3383  break;
3384  case std::ios::hex :
3385  base = 16;
3386  block = 4;
3387  suffix = 'h';
3388  break;
3389  default :
3390  base = 10;
3391  block = 3;
3392  suffix = '.';
3393  }
3394 
3395  Integer temp1=a, temp2;
3396 
3397  if (a.IsNegative())
3398  {
3399  out << '-';
3400  temp1.Negate();
3401  }
3402 
3403  if (!a)
3404  out << '0';
3405 
3406  static const char upper[]="0123456789ABCDEF";
3407  static const char lower[]="0123456789abcdef";
3408 
3409  const char* vec = (out.flags() & std::ios::uppercase) ? upper : lower;
3410  unsigned i=0;
3411  SecBlock<char> s(a.BitCount() / (BitPrecision(base)-1) + 1);
3412 
3413  while (!!temp1)
3414  {
3415  word digit;
3416  Integer::Divide(digit, temp2, temp1, base);
3417  s[i++]=vec[digit];
3418  temp1.swap(temp2);
3419  }
3420 
3421  while (i--)
3422  {
3423  out << s[i];
3424 // if (i && !(i%block))
3425 // out << ",";
3426  }
3427  return out << suffix;
3428 }
3429 
3430 Integer& Integer::operator++()
3431 {
3432  if (NotNegative())
3433  {
3434  if (Increment(reg, reg.size()))
3435  {
3436  reg.CleanGrow(2*reg.size());
3437  reg[reg.size()/2]=1;
3438  }
3439  }
3440  else
3441  {
3442  word borrow = Decrement(reg, reg.size());
3443  assert(!borrow);
3444  if (WordCount()==0)
3445  *this = Zero();
3446  }
3447  return *this;
3448 }
3449 
3450 Integer& Integer::operator--()
3451 {
3452  if (IsNegative())
3453  {
3454  if (Increment(reg, reg.size()))
3455  {
3456  reg.CleanGrow(2*reg.size());
3457  reg[reg.size()/2]=1;
3458  }
3459  }
3460  else
3461  {
3462  if (Decrement(reg, reg.size()))
3463  *this = -One();
3464  }
3465  return *this;
3466 }
3467 
3468 void PositiveAdd(Integer &sum, const Integer &a, const Integer& b)
3469 {
3470  int carry;
3471  if (a.reg.size() == b.reg.size())
3472  carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3473  else if (a.reg.size() > b.reg.size())
3474  {
3475  carry = Add(sum.reg, a.reg, b.reg, b.reg.size());
3476  CopyWords(sum.reg+b.reg.size(), a.reg+b.reg.size(), a.reg.size()-b.reg.size());
3477  carry = Increment(sum.reg+b.reg.size(), a.reg.size()-b.reg.size(), carry);
3478  }
3479  else
3480  {
3481  carry = Add(sum.reg, a.reg, b.reg, a.reg.size());
3482  CopyWords(sum.reg+a.reg.size(), b.reg+a.reg.size(), b.reg.size()-a.reg.size());
3483  carry = Increment(sum.reg+a.reg.size(), b.reg.size()-a.reg.size(), carry);
3484  }
3485 
3486  if (carry)
3487  {
3488  sum.reg.CleanGrow(2*sum.reg.size());
3489  sum.reg[sum.reg.size()/2] = 1;
3490  }
3491  sum.sign = Integer::POSITIVE;
3492 }
3493 
3494 void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
3495 {
3496  unsigned aSize = a.WordCount();
3497  aSize += aSize%2;
3498  unsigned bSize = b.WordCount();
3499  bSize += bSize%2;
3500 
3501  if (aSize == bSize)
3502  {
3503  if (Compare(a.reg, b.reg, aSize) >= 0)
3504  {
3505  Subtract(diff.reg, a.reg, b.reg, aSize);
3506  diff.sign = Integer::POSITIVE;
3507  }
3508  else
3509  {
3510  Subtract(diff.reg, b.reg, a.reg, aSize);
3511  diff.sign = Integer::NEGATIVE;
3512  }
3513  }
3514  else if (aSize > bSize)
3515  {
3516  word borrow = Subtract(diff.reg, a.reg, b.reg, bSize);
3517  CopyWords(diff.reg+bSize, a.reg+bSize, aSize-bSize);
3518  borrow = Decrement(diff.reg+bSize, aSize-bSize, borrow);
3519  assert(!borrow);
3520  diff.sign = Integer::POSITIVE;
3521  }
3522  else
3523  {
3524  word borrow = Subtract(diff.reg, b.reg, a.reg, aSize);
3525  CopyWords(diff.reg+aSize, b.reg+aSize, bSize-aSize);
3526  borrow = Decrement(diff.reg+aSize, bSize-aSize, borrow);
3527  assert(!borrow);
3528  diff.sign = Integer::NEGATIVE;
3529  }
3530 }
3531 
3532 // MSVC .NET 2003 workaround
3533 template <class T> inline const T& STDMAX2(const T& a, const T& b)
3534 {
3535  return a < b ? b : a;
3536 }
3537 
3538 Integer Integer::Plus(const Integer& b) const
3539 {
3540  Integer sum((word)0, STDMAX2(reg.size(), b.reg.size()));
3541  if (NotNegative())
3542  {
3543  if (b.NotNegative())
3544  PositiveAdd(sum, *this, b);
3545  else
3546  PositiveSubtract(sum, *this, b);
3547  }
3548  else
3549  {
3550  if (b.NotNegative())
3551  PositiveSubtract(sum, b, *this);
3552  else
3553  {
3554  PositiveAdd(sum, *this, b);
3555  sum.sign = Integer::NEGATIVE;
3556  }
3557  }
3558  return sum;
3559 }
3560 
3561 Integer& Integer::operator+=(const Integer& t)
3562 {
3563  reg.CleanGrow(t.reg.size());
3564  if (NotNegative())
3565  {
3566  if (t.NotNegative())
3567  PositiveAdd(*this, *this, t);
3568  else
3569  PositiveSubtract(*this, *this, t);
3570  }
3571  else
3572  {
3573  if (t.NotNegative())
3574  PositiveSubtract(*this, t, *this);
3575  else
3576  {
3577  PositiveAdd(*this, *this, t);
3578  sign = Integer::NEGATIVE;
3579  }
3580  }
3581  return *this;
3582 }
3583 
3584 Integer Integer::Minus(const Integer& b) const
3585 {
3586  Integer diff((word)0, STDMAX2(reg.size(), b.reg.size()));
3587  if (NotNegative())
3588  {
3589  if (b.NotNegative())
3590  PositiveSubtract(diff, *this, b);
3591  else
3592  PositiveAdd(diff, *this, b);
3593  }
3594  else
3595  {
3596  if (b.NotNegative())
3597  {
3598  PositiveAdd(diff, *this, b);
3599  diff.sign = Integer::NEGATIVE;
3600  }
3601  else
3602  PositiveSubtract(diff, b, *this);
3603  }
3604  return diff;
3605 }
3606 
3607 Integer& Integer::operator-=(const Integer& t)
3608 {
3609  reg.CleanGrow(t.reg.size());
3610  if (NotNegative())
3611  {
3612  if (t.NotNegative())
3613  PositiveSubtract(*this, *this, t);
3614  else
3615  PositiveAdd(*this, *this, t);
3616  }
3617  else
3618  {
3619  if (t.NotNegative())
3620  {
3621  PositiveAdd(*this, *this, t);
3622  sign = Integer::NEGATIVE;
3623  }
3624  else
3625  PositiveSubtract(*this, t, *this);
3626  }
3627  return *this;
3628 }
3629 
3630 Integer& Integer::operator<<=(size_t n)
3631 {
3632  const size_t wordCount = WordCount();
3633  const size_t shiftWords = n / WORD_BITS;
3634  const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
3635 
3636  reg.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
3637  ShiftWordsLeftByWords(reg, wordCount + shiftWords, shiftWords);
3638  ShiftWordsLeftByBits(reg+shiftWords, wordCount+BitsToWords(shiftBits), shiftBits);
3639  return *this;
3640 }
3641 
3642 Integer& Integer::operator>>=(size_t n)
3643 {
3644  const size_t wordCount = WordCount();
3645  const size_t shiftWords = n / WORD_BITS;
3646  const unsigned int shiftBits = (unsigned int)(n % WORD_BITS);
3647 
3648  ShiftWordsRightByWords(reg, wordCount, shiftWords);
3649  if (wordCount > shiftWords)
3650  ShiftWordsRightByBits(reg, wordCount-shiftWords, shiftBits);
3651  if (IsNegative() && WordCount()==0) // avoid -0
3652  *this = Zero();
3653  return *this;
3654 }
3655 
3656 void PositiveMultiply(Integer &product, const Integer &a, const Integer &b)
3657 {
3658  size_t aSize = RoundupSize(a.WordCount());
3659  size_t bSize = RoundupSize(b.WordCount());
3660 
3661  product.reg.CleanNew(RoundupSize(aSize+bSize));
3662  product.sign = Integer::POSITIVE;
3663 
3664  IntegerSecBlock workspace(aSize + bSize);
3665  AsymmetricMultiply(product.reg, workspace, a.reg, aSize, b.reg, bSize);
3666 }
3667 
3668 void Multiply(Integer &product, const Integer &a, const Integer &b)
3669 {
3670  PositiveMultiply(product, a, b);
3671 
3672  if (a.NotNegative() != b.NotNegative())
3673  product.Negate();
3674 }
3675 
3676 Integer Integer::Times(const Integer &b) const
3677 {
3678  Integer product;
3679  Multiply(product, *this, b);
3680  return product;
3681 }
3682 
3683 /*
3684 void PositiveDivide(Integer &remainder, Integer &quotient,
3685  const Integer &dividend, const Integer &divisor)
3686 {
3687  remainder.reg.CleanNew(divisor.reg.size());
3688  remainder.sign = Integer::POSITIVE;
3689  quotient.reg.New(0);
3690  quotient.sign = Integer::POSITIVE;
3691  unsigned i=dividend.BitCount();
3692  while (i--)
3693  {
3694  word overflow = ShiftWordsLeftByBits(remainder.reg, remainder.reg.size(), 1);
3695  remainder.reg[0] |= dividend[i];
3696  if (overflow || remainder >= divisor)
3697  {
3698  Subtract(remainder.reg, remainder.reg, divisor.reg, remainder.reg.size());
3699  quotient.SetBit(i);
3700  }
3701  }
3702 }
3703 */
3704 
3705 void PositiveDivide(Integer &remainder, Integer &quotient,
3706  const Integer &a, const Integer &b)
3707 {
3708  unsigned aSize = a.WordCount();
3709  unsigned bSize = b.WordCount();
3710 
3711  if (!bSize)
3712  throw Integer::DivideByZero();
3713 
3714  if (aSize < bSize)
3715  {
3716  remainder = a;
3717  remainder.sign = Integer::POSITIVE;
3718  quotient = Integer::Zero();
3719  return;
3720  }
3721 
3722  aSize += aSize%2; // round up to next even number
3723  bSize += bSize%2;
3724 
3725  remainder.reg.CleanNew(RoundupSize(bSize));
3726  remainder.sign = Integer::POSITIVE;
3727  quotient.reg.CleanNew(RoundupSize(aSize-bSize+2));
3728  quotient.sign = Integer::POSITIVE;
3729 
3730  IntegerSecBlock T(aSize+3*(bSize+2));
3731  Divide(remainder.reg, quotient.reg, T, a.reg, aSize, b.reg, bSize);
3732 }
3733 
3734 void Integer::Divide(Integer &remainder, Integer &quotient, const Integer &dividend, const Integer &divisor)
3735 {
3736  PositiveDivide(remainder, quotient, dividend, divisor);
3737 
3738  if (dividend.IsNegative())
3739  {
3740  quotient.Negate();
3741  if (remainder.NotZero())
3742  {
3743  --quotient;
3744  remainder = divisor.AbsoluteValue() - remainder;
3745  }
3746  }
3747 
3748  if (divisor.IsNegative())
3749  quotient.Negate();
3750 }
3751 
3752 void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, unsigned int n)
3753 {
3754  q = a;
3755  q >>= n;
3756 
3757  const size_t wordCount = BitsToWords(n);
3758  if (wordCount <= a.WordCount())
3759  {
3760  r.reg.resize(RoundupSize(wordCount));
3761  CopyWords(r.reg, a.reg, wordCount);
3762  SetWords(r.reg+wordCount, 0, r.reg.size()-wordCount);
3763  if (n % WORD_BITS != 0)
3764  r.reg[wordCount-1] %= (word(1) << (n % WORD_BITS));
3765  }
3766  else
3767  {
3768  r.reg.resize(RoundupSize(a.WordCount()));
3769  CopyWords(r.reg, a.reg, r.reg.size());
3770  }
3771  r.sign = POSITIVE;
3772 
3773  if (a.IsNegative() && r.NotZero())
3774  {
3775  --q;
3776  r = Power2(n) - r;
3777  }
3778 }
3779 
3780 Integer Integer::DividedBy(const Integer &b) const
3781 {
3782  Integer remainder, quotient;
3783  Integer::Divide(remainder, quotient, *this, b);
3784  return quotient;
3785 }
3786 
3787 Integer Integer::Modulo(const Integer &b) const
3788 {
3789  Integer remainder, quotient;
3790  Integer::Divide(remainder, quotient, *this, b);
3791  return remainder;
3792 }
3793 
3794 void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend, word divisor)
3795 {
3796  if (!divisor)
3797  throw Integer::DivideByZero();
3798 
3799  assert(divisor);
3800 
3801  if ((divisor & (divisor-1)) == 0) // divisor is a power of 2
3802  {
3803  quotient = dividend >> (BitPrecision(divisor)-1);
3804  remainder = dividend.reg[0] & (divisor-1);
3805  return;
3806  }
3807 
3808  unsigned int i = dividend.WordCount();
3809  quotient.reg.CleanNew(RoundupSize(i));
3810  remainder = 0;
3811  while (i--)
3812  {
3813  quotient.reg[i] = DWord(dividend.reg[i], remainder) / divisor;
3814  remainder = DWord(dividend.reg[i], remainder) % divisor;
3815  }
3816 
3817  if (dividend.NotNegative())
3818  quotient.sign = POSITIVE;
3819  else
3820  {
3821  quotient.sign = NEGATIVE;
3822  if (remainder)
3823  {
3824  --quotient;
3825  remainder = divisor - remainder;
3826  }
3827  }
3828 }
3829 
3830 Integer Integer::DividedBy(word b) const
3831 {
3832  word remainder;
3833  Integer quotient;
3834  Integer::Divide(remainder, quotient, *this, b);
3835  return quotient;
3836 }
3837 
3838 word Integer::Modulo(word divisor) const
3839 {
3840  if (!divisor)
3841  throw Integer::DivideByZero();
3842 
3843  assert(divisor);
3844 
3845  word remainder;
3846 
3847  if ((divisor & (divisor-1)) == 0) // divisor is a power of 2
3848  remainder = reg[0] & (divisor-1);
3849  else
3850  {
3851  unsigned int i = WordCount();
3852 
3853  if (divisor <= 5)
3854  {
3855  DWord sum(0, 0);
3856  while (i--)
3857  sum += reg[i];
3858  remainder = sum % divisor;
3859  }
3860  else
3861  {
3862  remainder = 0;
3863  while (i--)
3864  remainder = DWord(reg[i], remainder) % divisor;
3865  }
3866  }
3867 
3868  if (IsNegative() && remainder)
3869  remainder = divisor - remainder;
3870 
3871  return remainder;
3872 }
3873 
3874 void Integer::Negate()
3875 {
3876  if (!!(*this)) // don't flip sign if *this==0
3877  sign = Sign(1-sign);
3878 }
3879 
3880 int Integer::PositiveCompare(const Integer& t) const
3881 {
3882  unsigned size = WordCount(), tSize = t.WordCount();
3883 
3884  if (size == tSize)
3885  return CryptoPP::Compare(reg, t.reg, size);
3886  else
3887  return size > tSize ? 1 : -1;
3888 }
3889 
3890 int Integer::Compare(const Integer& t) const
3891 {
3892  if (NotNegative())
3893  {
3894  if (t.NotNegative())
3895  return PositiveCompare(t);
3896  else
3897  return 1;
3898  }
3899  else
3900  {
3901  if (t.NotNegative())
3902  return -1;
3903  else
3904  return -PositiveCompare(t);
3905  }
3906 }
3907 
3909 {
3910  if (!IsPositive())
3911  return Zero();
3912 
3913  // overestimate square root
3914  Integer x, y = Power2((BitCount()+1)/2);
3915  assert(y*y >= *this);
3916 
3917  do
3918  {
3919  x = y;
3920  y = (x + *this/x) >> 1;
3921  } while (y<x);
3922 
3923  return x;
3924 }
3925 
3926 bool Integer::IsSquare() const
3927 {
3928  Integer r = SquareRoot();
3929  return *this == r.Squared();
3930 }
3931 
3932 bool Integer::IsUnit() const
3933 {
3934  return (WordCount() == 1) && (reg[0] == 1);
3935 }
3936 
3938 {
3939  return IsUnit() ? *this : Zero();
3940 }
3941 
3942 Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
3943 {
3944  return x*y%m;
3945 }
3946 
3947 Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
3948 {
3949  ModularArithmetic mr(m);
3950  return mr.Exponentiate(x, e);
3951 }
3952 
3954 {
3955  return EuclideanDomainOf<Integer>().Gcd(a, b);
3956 }
3957 
3959 {
3960  assert(m.NotNegative());
3961 
3962  if (IsNegative())
3963  return Modulo(m).InverseMod(m);
3964 
3965  if (m.IsEven())
3966  {
3967  if (!m || IsEven())
3968  return Zero(); // no inverse
3969  if (*this == One())
3970  return One();
3971 
3972  Integer u = m.Modulo(*this).InverseMod(*this);
3973  return !u ? Zero() : (m*(*this-u)+1)/(*this);
3974  }
3975 
3976  SecBlock<word> T(m.reg.size() * 4);
3977  Integer r((word)0, m.reg.size());
3978  unsigned k = AlmostInverse(r.reg, T, reg, reg.size(), m.reg, m.reg.size());
3979  DivideByPower2Mod(r.reg, r.reg, k, m.reg, m.reg.size());
3980  return r;
3981 }
3982 
3983 word Integer::InverseMod(word mod) const
3984 {
3985  word g0 = mod, g1 = *this % mod;
3986  word v0 = 0, v1 = 1;
3987  word y;
3988 
3989  while (g1)
3990  {
3991  if (g1 == 1)
3992  return v1;
3993  y = g0 / g1;
3994  g0 = g0 % g1;
3995  v0 += y * v1;
3996 
3997  if (!g0)
3998  break;
3999  if (g0 == 1)
4000  return mod-v0;
4001  y = g1 / g0;
4002  g1 = g1 % g0;
4003  v1 += y * v0;
4004  }
4005  return 0;
4006 }
4007 
4008 // ********************************************************
4009 
4010 ModularArithmetic::ModularArithmetic(BufferedTransformation &bt)
4011 {
4012  BERSequenceDecoder seq(bt);
4013  OID oid(seq);
4014  if (oid != ASN1::prime_field())
4015  BERDecodeError();
4016  m_modulus.BERDecode(seq);
4017  seq.MessageEnd();
4018  m_result.reg.resize(m_modulus.reg.size());
4019 }
4020 
4021 void ModularArithmetic::DEREncode(BufferedTransformation &bt) const
4022 {
4023  DERSequenceEncoder seq(bt);
4024  ASN1::prime_field().DEREncode(seq);
4025  m_modulus.DEREncode(seq);
4026  seq.MessageEnd();
4027 }
4028 
4029 void ModularArithmetic::DEREncodeElement(BufferedTransformation &out, const Element &a) const
4030 {
4031  a.DEREncodeAsOctetString(out, MaxElementByteLength());
4032 }
4033 
4034 void ModularArithmetic::BERDecodeElement(BufferedTransformation &in, Element &a) const
4035 {
4036  a.BERDecodeAsOctetString(in, MaxElementByteLength());
4037 }
4038 
4039 const Integer& ModularArithmetic::Half(const Integer &a) const
4040 {
4041  if (a.reg.size()==m_modulus.reg.size())
4042  {
4043  CryptoPP::DivideByPower2Mod(m_result.reg.begin(), a.reg, 1, m_modulus.reg, a.reg.size());
4044  return m_result;
4045  }
4046  else
4047  return m_result1 = (a.IsEven() ? (a >> 1) : ((a+m_modulus) >> 1));
4048 }
4049 
4050 const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
4051 {
4052  if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4053  {
4054  if (CryptoPP::Add(m_result.reg.begin(), a.reg, b.reg, a.reg.size())
4055  || Compare(m_result.reg, m_modulus.reg, a.reg.size()) >= 0)
4056  {
4057  CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
4058  }
4059  return m_result;
4060  }
4061  else
4062  {
4063  m_result1 = a+b;
4064  if (m_result1 >= m_modulus)
4065  m_result1 -= m_modulus;
4066  return m_result1;
4067  }
4068 }
4069 
4070 Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
4071 {
4072  if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4073  {
4074  if (CryptoPP::Add(a.reg, a.reg, b.reg, a.reg.size())
4075  || Compare(a.reg, m_modulus.reg, a.reg.size()) >= 0)
4076  {
4077  CryptoPP::Subtract(a.reg, a.reg, m_modulus.reg, a.reg.size());
4078  }
4079  }
4080  else
4081  {
4082  a+=b;
4083  if (a>=m_modulus)
4084  a-=m_modulus;
4085  }
4086 
4087  return a;
4088 }
4089 
4090 const Integer& ModularArithmetic::Subtract(const Integer &a, const Integer &b) const
4091 {
4092  if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4093  {
4094  if (CryptoPP::Subtract(m_result.reg.begin(), a.reg, b.reg, a.reg.size()))
4095  CryptoPP::Add(m_result.reg.begin(), m_result.reg, m_modulus.reg, a.reg.size());
4096  return m_result;
4097  }
4098  else
4099  {
4100  m_result1 = a-b;
4101  if (m_result1.IsNegative())
4102  m_result1 += m_modulus;
4103  return m_result1;
4104  }
4105 }
4106 
4107 Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
4108 {
4109  if (a.reg.size()==m_modulus.reg.size() && b.reg.size()==m_modulus.reg.size())
4110  {
4111  if (CryptoPP::Subtract(a.reg, a.reg, b.reg, a.reg.size()))
4112  CryptoPP::Add(a.reg, a.reg, m_modulus.reg, a.reg.size());
4113  }
4114  else
4115  {
4116  a-=b;
4117  if (a.IsNegative())
4118  a+=m_modulus;
4119  }
4120 
4121  return a;
4122 }
4123 
4124 const Integer& ModularArithmetic::Inverse(const Integer &a) const
4125 {
4126  if (!a)
4127  return a;
4128 
4129  CopyWords(m_result.reg.begin(), m_modulus.reg, m_modulus.reg.size());
4130  if (CryptoPP::Subtract(m_result.reg.begin(), m_result.reg, a.reg, a.reg.size()))
4131  Decrement(m_result.reg.begin()+a.reg.size(), m_modulus.reg.size()-a.reg.size());
4132 
4133  return m_result;
4134 }
4135 
4136 Integer ModularArithmetic::CascadeExponentiate(const Integer &x, const Integer &e1, const Integer &y, const Integer &e2) const
4137 {
4138  if (m_modulus.IsOdd())
4139  {
4140  MontgomeryRepresentation dr(m_modulus);
4141  return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, dr.ConvertIn(y), e2));
4142  }
4143  else
4144  return AbstractRing<Integer>::CascadeExponentiate(x, e1, y, e2);
4145 }
4146 
4147 void ModularArithmetic::SimultaneousExponentiate(Integer *results, const Integer &base, const Integer *exponents, unsigned int exponentsCount) const
4148 {
4149  if (m_modulus.IsOdd())
4150  {
4151  MontgomeryRepresentation dr(m_modulus);
4152  dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, exponentsCount);
4153  for (unsigned int i=0; i<exponentsCount; i++)
4154  results[i] = dr.ConvertOut(results[i]);
4155  }
4156  else
4157  AbstractRing<Integer>::SimultaneousExponentiate(results, base, exponents, exponentsCount);
4158 }
4159 
4160 MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m) // modulus must be odd
4161  : ModularArithmetic(m),
4162  m_u((word)0, m_modulus.reg.size()),
4163  m_workspace(5*m_modulus.reg.size())
4164 {
4165  if (!m_modulus.IsOdd())
4166  throw InvalidArgument("MontgomeryRepresentation: Montgomery representation requires an odd modulus");
4167 
4168  RecursiveInverseModPower2(m_u.reg, m_workspace, m_modulus.reg, m_modulus.reg.size());
4169 }
4170 
4171 const Integer& MontgomeryRepresentation::Multiply(const Integer &a, const Integer &b) const
4172 {
4173  word *const T = m_workspace.begin();
4174  word *const R = m_result.reg.begin();
4175  const size_t N = m_modulus.reg.size();
4176  assert(a.reg.size()<=N && b.reg.size()<=N);
4177 
4178  AsymmetricMultiply(T, T+2*N, a.reg, a.reg.size(), b.reg, b.reg.size());
4179  SetWords(T+a.reg.size()+b.reg.size(), 0, 2*N-a.reg.size()-b.reg.size());
4180  MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4181  return m_result;
4182 }
4183 
4184 const Integer& MontgomeryRepresentation::Square(const Integer &a) const
4185 {
4186  word *const T = m_workspace.begin();
4187  word *const R = m_result.reg.begin();
4188  const size_t N = m_modulus.reg.size();
4189  assert(a.reg.size()<=N);
4190 
4191  CryptoPP::Square(T, T+2*N, a.reg, a.reg.size());
4192  SetWords(T+2*a.reg.size(), 0, 2*N-2*a.reg.size());
4193  MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4194  return m_result;
4195 }
4196 
4197 Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
4198 {
4199  word *const T = m_workspace.begin();
4200  word *const R = m_result.reg.begin();
4201  const size_t N = m_modulus.reg.size();
4202  assert(a.reg.size()<=N);
4203 
4204  CopyWords(T, a.reg, a.reg.size());
4205  SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
4206  MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4207  return m_result;
4208 }
4209 
4210 const Integer& MontgomeryRepresentation::MultiplicativeInverse(const Integer &a) const
4211 {
4212 // return (EuclideanMultiplicativeInverse(a, modulus)<<(2*WORD_BITS*modulus.reg.size()))%modulus;
4213  word *const T = m_workspace.begin();
4214  word *const R = m_result.reg.begin();
4215  const size_t N = m_modulus.reg.size();
4216  assert(a.reg.size()<=N);
4217 
4218  CopyWords(T, a.reg, a.reg.size());
4219  SetWords(T+a.reg.size(), 0, 2*N-a.reg.size());
4220  MontgomeryReduce(R, T+2*N, T, m_modulus.reg, m_u.reg, N);
4221  unsigned k = AlmostInverse(R, T, R, N, m_modulus.reg, N);
4222 
4223 // cout << "k=" << k << " N*32=" << 32*N << endl;
4224 
4225  if (k>N*WORD_BITS)
4226  DivideByPower2Mod(R, R, k-N*WORD_BITS, m_modulus.reg, N);
4227  else
4228  MultiplyByPower2Mod(R, R, N*WORD_BITS-k, m_modulus.reg, N);
4229 
4230  return m_result;
4231 }
4232 
4233 NAMESPACE_END
4234 
4235 #endif