Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32f_sin_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
72 #include <inttypes.h>
73 #include <math.h>
74 #include <stdio.h>
75 
76 #ifndef INCLUDED_volk_32f_sin_32f_a_H
77 #define INCLUDED_volk_32f_sin_32f_a_H
78 #ifdef LV_HAVE_AVX512F
79 
80 #include <immintrin.h>
81 static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
82  const float* inVector,
83  unsigned int num_points)
84 {
85  float* sinPtr = sinVector;
86  const float* inPtr = inVector;
87 
88  unsigned int number = 0;
89  unsigned int sixteenPoints = num_points / 16;
90  unsigned int i = 0;
91 
92  __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
93  fones;
94  __m512 sine, cosine;
95  __m512i q, zeros, ones, twos, fours;
96 
97  m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
98  pio4A = _mm512_set1_ps(0.7853981554508209228515625);
99  pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
100  pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
101  ffours = _mm512_set1_ps(4.0);
102  ftwos = _mm512_set1_ps(2.0);
103  fones = _mm512_set1_ps(1.0);
104  zeros = _mm512_setzero_epi32();
105  ones = _mm512_set1_epi32(1);
106  twos = _mm512_set1_epi32(2);
107  fours = _mm512_set1_epi32(4);
108 
109  cp1 = _mm512_set1_ps(1.0);
110  cp2 = _mm512_set1_ps(0.08333333333333333);
111  cp3 = _mm512_set1_ps(0.002777777777777778);
112  cp4 = _mm512_set1_ps(4.96031746031746e-05);
113  cp5 = _mm512_set1_ps(5.511463844797178e-07);
114  __mmask16 condition1, condition2, ltZero;
115 
116  for (; number < sixteenPoints; number++) {
117  aVal = _mm512_load_ps(inPtr);
118  // s = fabs(aVal)
119  s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
120 
121  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
122  q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
123  // r = q + q&1, q indicates quadrant, r gives
124  r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
125 
126  s = _mm512_fnmadd_ps(r, pio4A, s);
127  s = _mm512_fnmadd_ps(r, pio4B, s);
128  s = _mm512_fnmadd_ps(r, pio4C, s);
129 
130  s = _mm512_div_ps(
131  s,
132  _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
133  s = _mm512_mul_ps(s, s);
134  // Evaluate Taylor series
135  s = _mm512_mul_ps(
136  _mm512_fmadd_ps(
137  _mm512_fmsub_ps(
138  _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
139  s,
140  cp1),
141  s);
142 
143  for (i = 0; i < 3; i++)
144  s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
145  s = _mm512_div_ps(s, ftwos);
146 
147  sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
148  cosine = _mm512_sub_ps(fones, s);
149 
150  condition1 = _mm512_cmpneq_epi32_mask(
151  _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
152  ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
153  condition2 = _mm512_kxor(
154  _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
155 
156  sine = _mm512_mask_blend_ps(condition1, sine, cosine);
157  sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
158  _mm512_store_ps(sinPtr, sine);
159  inPtr += 16;
160  sinPtr += 16;
161  }
162 
163  number = sixteenPoints * 16;
164  for (; number < num_points; number++) {
165  *sinPtr++ = sinf(*inPtr++);
166  }
167 }
168 #endif
169 #if LV_HAVE_AVX2 && LV_HAVE_FMA
170 #include <immintrin.h>
171 
172 static inline void
173 volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
174 {
175  float* bPtr = bVector;
176  const float* aPtr = aVector;
177 
178  unsigned int number = 0;
179  unsigned int eighthPoints = num_points / 8;
180  unsigned int i = 0;
181 
182  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
183  fzeroes;
184  __m256 sine, cosine, condition1, condition2;
185  __m256i q, r, ones, twos, fours;
186 
187  m4pi = _mm256_set1_ps(1.273239545);
188  pio4A = _mm256_set1_ps(0.78515625);
189  pio4B = _mm256_set1_ps(0.241876e-3);
190  ffours = _mm256_set1_ps(4.0);
191  ftwos = _mm256_set1_ps(2.0);
192  fones = _mm256_set1_ps(1.0);
193  fzeroes = _mm256_setzero_ps();
194  ones = _mm256_set1_epi32(1);
195  twos = _mm256_set1_epi32(2);
196  fours = _mm256_set1_epi32(4);
197 
198  cp1 = _mm256_set1_ps(1.0);
199  cp2 = _mm256_set1_ps(0.83333333e-1);
200  cp3 = _mm256_set1_ps(0.2777778e-2);
201  cp4 = _mm256_set1_ps(0.49603e-4);
202  cp5 = _mm256_set1_ps(0.551e-6);
203 
204  for (; number < eighthPoints; number++) {
205  aVal = _mm256_load_ps(aPtr);
206  s = _mm256_sub_ps(aVal,
207  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
208  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
209  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
210  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
211 
212  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
213  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
214 
215  s = _mm256_div_ps(
216  s,
217  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
218  s = _mm256_mul_ps(s, s);
219  // Evaluate Taylor series
220  s = _mm256_mul_ps(
221  _mm256_fmadd_ps(
222  _mm256_fmsub_ps(
223  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
224  s,
225  cp1),
226  s);
227 
228  for (i = 0; i < 3; i++) {
229  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
230  }
231  s = _mm256_div_ps(s, ftwos);
232 
233  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
234  cosine = _mm256_sub_ps(fones, s);
235 
236  condition1 = _mm256_cmp_ps(
237  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
238  fzeroes,
239  _CMP_NEQ_UQ);
240  condition2 = _mm256_cmp_ps(
241  _mm256_cmp_ps(
242  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
243  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
244  _CMP_NEQ_UQ);
245  // Need this condition only for cos
246  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
247  // twos), fours)), fzeroes);
248 
249  sine =
250  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
251  sine = _mm256_sub_ps(
252  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
253  _mm256_store_ps(bPtr, sine);
254  aPtr += 8;
255  bPtr += 8;
256  }
257 
258  number = eighthPoints * 8;
259  for (; number < num_points; number++) {
260  *bPtr++ = sin(*aPtr++);
261  }
262 }
263 
264 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
265 
266 #ifdef LV_HAVE_AVX2
267 #include <immintrin.h>
268 
269 static inline void
270 volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
271 {
272  float* bPtr = bVector;
273  const float* aPtr = aVector;
274 
275  unsigned int number = 0;
276  unsigned int eighthPoints = num_points / 8;
277  unsigned int i = 0;
278 
279  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
280  fzeroes;
281  __m256 sine, cosine, condition1, condition2;
282  __m256i q, r, ones, twos, fours;
283 
284  m4pi = _mm256_set1_ps(1.273239545);
285  pio4A = _mm256_set1_ps(0.78515625);
286  pio4B = _mm256_set1_ps(0.241876e-3);
287  ffours = _mm256_set1_ps(4.0);
288  ftwos = _mm256_set1_ps(2.0);
289  fones = _mm256_set1_ps(1.0);
290  fzeroes = _mm256_setzero_ps();
291  ones = _mm256_set1_epi32(1);
292  twos = _mm256_set1_epi32(2);
293  fours = _mm256_set1_epi32(4);
294 
295  cp1 = _mm256_set1_ps(1.0);
296  cp2 = _mm256_set1_ps(0.83333333e-1);
297  cp3 = _mm256_set1_ps(0.2777778e-2);
298  cp4 = _mm256_set1_ps(0.49603e-4);
299  cp5 = _mm256_set1_ps(0.551e-6);
300 
301  for (; number < eighthPoints; number++) {
302  aVal = _mm256_load_ps(aPtr);
303  s = _mm256_sub_ps(aVal,
304  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
305  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
306  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
307  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
308 
309  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
310  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
311 
312  s = _mm256_div_ps(
313  s,
314  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
315  s = _mm256_mul_ps(s, s);
316  // Evaluate Taylor series
317  s = _mm256_mul_ps(
318  _mm256_add_ps(
319  _mm256_mul_ps(
320  _mm256_sub_ps(
321  _mm256_mul_ps(
322  _mm256_add_ps(
323  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
324  s),
325  cp3),
326  s),
327  cp2),
328  s),
329  cp1),
330  s);
331 
332  for (i = 0; i < 3; i++) {
333  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
334  }
335  s = _mm256_div_ps(s, ftwos);
336 
337  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
338  cosine = _mm256_sub_ps(fones, s);
339 
340  condition1 = _mm256_cmp_ps(
341  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
342  fzeroes,
343  _CMP_NEQ_UQ);
344  condition2 = _mm256_cmp_ps(
345  _mm256_cmp_ps(
346  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
347  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
348  _CMP_NEQ_UQ);
349  // Need this condition only for cos
350  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
351  // twos), fours)), fzeroes);
352 
353  sine =
354  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
355  sine = _mm256_sub_ps(
356  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
357  _mm256_store_ps(bPtr, sine);
358  aPtr += 8;
359  bPtr += 8;
360  }
361 
362  number = eighthPoints * 8;
363  for (; number < num_points; number++) {
364  *bPtr++ = sin(*aPtr++);
365  }
366 }
367 
368 #endif /* LV_HAVE_AVX2 for aligned */
369 
370 #ifdef LV_HAVE_SSE4_1
371 #include <smmintrin.h>
372 
373 static inline void
374 volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
375 {
376  float* bPtr = bVector;
377  const float* aPtr = aVector;
378 
379  unsigned int number = 0;
380  unsigned int quarterPoints = num_points / 4;
381  unsigned int i = 0;
382 
383  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
384  fzeroes;
385  __m128 sine, cosine, condition1, condition2;
386  __m128i q, r, ones, twos, fours;
387 
388  m4pi = _mm_set1_ps(1.273239545);
389  pio4A = _mm_set1_ps(0.78515625);
390  pio4B = _mm_set1_ps(0.241876e-3);
391  ffours = _mm_set1_ps(4.0);
392  ftwos = _mm_set1_ps(2.0);
393  fones = _mm_set1_ps(1.0);
394  fzeroes = _mm_setzero_ps();
395  ones = _mm_set1_epi32(1);
396  twos = _mm_set1_epi32(2);
397  fours = _mm_set1_epi32(4);
398 
399  cp1 = _mm_set1_ps(1.0);
400  cp2 = _mm_set1_ps(0.83333333e-1);
401  cp3 = _mm_set1_ps(0.2777778e-2);
402  cp4 = _mm_set1_ps(0.49603e-4);
403  cp5 = _mm_set1_ps(0.551e-6);
404 
405  for (; number < quarterPoints; number++) {
406  aVal = _mm_load_ps(aPtr);
407  s = _mm_sub_ps(aVal,
408  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
409  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
410  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
411 
412  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
413  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
414 
415  s = _mm_div_ps(
416  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
417  s = _mm_mul_ps(s, s);
418  // Evaluate Taylor series
419  s = _mm_mul_ps(
420  _mm_add_ps(
421  _mm_mul_ps(
422  _mm_sub_ps(
423  _mm_mul_ps(
424  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
425  cp3),
426  s),
427  cp2),
428  s),
429  cp1),
430  s);
431 
432  for (i = 0; i < 3; i++) {
433  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
434  }
435  s = _mm_div_ps(s, ftwos);
436 
437  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
438  cosine = _mm_sub_ps(fones, s);
439 
440  condition1 = _mm_cmpneq_ps(
441  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
442  condition2 = _mm_cmpneq_ps(
443  _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
444  _mm_cmplt_ps(aVal, fzeroes));
445  // Need this condition only for cos
446  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
447  // twos), fours)), fzeroes);
448 
449  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
450  sine =
451  _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
452  _mm_store_ps(bPtr, sine);
453  aPtr += 4;
454  bPtr += 4;
455  }
456 
457  number = quarterPoints * 4;
458  for (; number < num_points; number++) {
459  *bPtr++ = sinf(*aPtr++);
460  }
461 }
462 
463 #endif /* LV_HAVE_SSE4_1 for aligned */
464 
465 
466 #endif /* INCLUDED_volk_32f_sin_32f_a_H */
467 
468 #ifndef INCLUDED_volk_32f_sin_32f_u_H
469 #define INCLUDED_volk_32f_sin_32f_u_H
470 
471 #ifdef LV_HAVE_AVX512F
472 
473 #include <immintrin.h>
474 static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
475  const float* inVector,
476  unsigned int num_points)
477 {
478  float* sinPtr = sinVector;
479  const float* inPtr = inVector;
480 
481  unsigned int number = 0;
482  unsigned int sixteenPoints = num_points / 16;
483  unsigned int i = 0;
484 
485  __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
486  fones;
487  __m512 sine, cosine;
488  __m512i q, zeros, ones, twos, fours;
489 
490  m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
491  pio4A = _mm512_set1_ps(0.7853981554508209228515625);
492  pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
493  pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
494  ffours = _mm512_set1_ps(4.0);
495  ftwos = _mm512_set1_ps(2.0);
496  fones = _mm512_set1_ps(1.0);
497  zeros = _mm512_setzero_epi32();
498  ones = _mm512_set1_epi32(1);
499  twos = _mm512_set1_epi32(2);
500  fours = _mm512_set1_epi32(4);
501 
502  cp1 = _mm512_set1_ps(1.0);
503  cp2 = _mm512_set1_ps(0.08333333333333333);
504  cp3 = _mm512_set1_ps(0.002777777777777778);
505  cp4 = _mm512_set1_ps(4.96031746031746e-05);
506  cp5 = _mm512_set1_ps(5.511463844797178e-07);
507  __mmask16 condition1, condition2, ltZero;
508 
509  for (; number < sixteenPoints; number++) {
510  aVal = _mm512_loadu_ps(inPtr);
511  // s = fabs(aVal)
512  s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
513 
514  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
515  q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
516  // r = q + q&1, q indicates quadrant, r gives
517  r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
518 
519  s = _mm512_fnmadd_ps(r, pio4A, s);
520  s = _mm512_fnmadd_ps(r, pio4B, s);
521  s = _mm512_fnmadd_ps(r, pio4C, s);
522 
523  s = _mm512_div_ps(
524  s,
525  _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
526  s = _mm512_mul_ps(s, s);
527  // Evaluate Taylor series
528  s = _mm512_mul_ps(
529  _mm512_fmadd_ps(
530  _mm512_fmsub_ps(
531  _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
532  s,
533  cp1),
534  s);
535 
536  for (i = 0; i < 3; i++)
537  s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
538  s = _mm512_div_ps(s, ftwos);
539 
540  sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
541  cosine = _mm512_sub_ps(fones, s);
542 
543  condition1 = _mm512_cmpneq_epi32_mask(
544  _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
545  ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
546  condition2 = _mm512_kxor(
547  _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
548 
549  sine = _mm512_mask_blend_ps(condition1, sine, cosine);
550  sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
551  _mm512_storeu_ps(sinPtr, sine);
552  inPtr += 16;
553  sinPtr += 16;
554  }
555 
556  number = sixteenPoints * 16;
557  for (; number < num_points; number++) {
558  *sinPtr++ = sinf(*inPtr++);
559  }
560 }
561 #endif
562 
563 #if LV_HAVE_AVX2 && LV_HAVE_FMA
564 #include <immintrin.h>
565 
566 static inline void
567 volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
568 {
569  float* bPtr = bVector;
570  const float* aPtr = aVector;
571 
572  unsigned int number = 0;
573  unsigned int eighthPoints = num_points / 8;
574  unsigned int i = 0;
575 
576  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
577  fzeroes;
578  __m256 sine, cosine, condition1, condition2;
579  __m256i q, r, ones, twos, fours;
580 
581  m4pi = _mm256_set1_ps(1.273239545);
582  pio4A = _mm256_set1_ps(0.78515625);
583  pio4B = _mm256_set1_ps(0.241876e-3);
584  ffours = _mm256_set1_ps(4.0);
585  ftwos = _mm256_set1_ps(2.0);
586  fones = _mm256_set1_ps(1.0);
587  fzeroes = _mm256_setzero_ps();
588  ones = _mm256_set1_epi32(1);
589  twos = _mm256_set1_epi32(2);
590  fours = _mm256_set1_epi32(4);
591 
592  cp1 = _mm256_set1_ps(1.0);
593  cp2 = _mm256_set1_ps(0.83333333e-1);
594  cp3 = _mm256_set1_ps(0.2777778e-2);
595  cp4 = _mm256_set1_ps(0.49603e-4);
596  cp5 = _mm256_set1_ps(0.551e-6);
597 
598  for (; number < eighthPoints; number++) {
599  aVal = _mm256_loadu_ps(aPtr);
600  s = _mm256_sub_ps(aVal,
601  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
602  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
603  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
604  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
605 
606  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
607  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
608 
609  s = _mm256_div_ps(
610  s,
611  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
612  s = _mm256_mul_ps(s, s);
613  // Evaluate Taylor series
614  s = _mm256_mul_ps(
615  _mm256_fmadd_ps(
616  _mm256_fmsub_ps(
617  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
618  s,
619  cp1),
620  s);
621 
622  for (i = 0; i < 3; i++) {
623  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
624  }
625  s = _mm256_div_ps(s, ftwos);
626 
627  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
628  cosine = _mm256_sub_ps(fones, s);
629 
630  condition1 = _mm256_cmp_ps(
631  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
632  fzeroes,
633  _CMP_NEQ_UQ);
634  condition2 = _mm256_cmp_ps(
635  _mm256_cmp_ps(
636  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
637  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
638  _CMP_NEQ_UQ);
639  // Need this condition only for cos
640  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
641  // twos), fours)), fzeroes);
642 
643  sine =
644  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
645  sine = _mm256_sub_ps(
646  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
647  _mm256_storeu_ps(bPtr, sine);
648  aPtr += 8;
649  bPtr += 8;
650  }
651 
652  number = eighthPoints * 8;
653  for (; number < num_points; number++) {
654  *bPtr++ = sin(*aPtr++);
655  }
656 }
657 
658 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
659 
660 #ifdef LV_HAVE_AVX2
661 #include <immintrin.h>
662 
663 static inline void
664 volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
665 {
666  float* bPtr = bVector;
667  const float* aPtr = aVector;
668 
669  unsigned int number = 0;
670  unsigned int eighthPoints = num_points / 8;
671  unsigned int i = 0;
672 
673  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
674  fzeroes;
675  __m256 sine, cosine, condition1, condition2;
676  __m256i q, r, ones, twos, fours;
677 
678  m4pi = _mm256_set1_ps(1.273239545);
679  pio4A = _mm256_set1_ps(0.78515625);
680  pio4B = _mm256_set1_ps(0.241876e-3);
681  ffours = _mm256_set1_ps(4.0);
682  ftwos = _mm256_set1_ps(2.0);
683  fones = _mm256_set1_ps(1.0);
684  fzeroes = _mm256_setzero_ps();
685  ones = _mm256_set1_epi32(1);
686  twos = _mm256_set1_epi32(2);
687  fours = _mm256_set1_epi32(4);
688 
689  cp1 = _mm256_set1_ps(1.0);
690  cp2 = _mm256_set1_ps(0.83333333e-1);
691  cp3 = _mm256_set1_ps(0.2777778e-2);
692  cp4 = _mm256_set1_ps(0.49603e-4);
693  cp5 = _mm256_set1_ps(0.551e-6);
694 
695  for (; number < eighthPoints; number++) {
696  aVal = _mm256_loadu_ps(aPtr);
697  s = _mm256_sub_ps(aVal,
698  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
699  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
700  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
701  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
702 
703  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
704  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
705 
706  s = _mm256_div_ps(
707  s,
708  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
709  s = _mm256_mul_ps(s, s);
710  // Evaluate Taylor series
711  s = _mm256_mul_ps(
712  _mm256_add_ps(
713  _mm256_mul_ps(
714  _mm256_sub_ps(
715  _mm256_mul_ps(
716  _mm256_add_ps(
717  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
718  s),
719  cp3),
720  s),
721  cp2),
722  s),
723  cp1),
724  s);
725 
726  for (i = 0; i < 3; i++) {
727  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
728  }
729  s = _mm256_div_ps(s, ftwos);
730 
731  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
732  cosine = _mm256_sub_ps(fones, s);
733 
734  condition1 = _mm256_cmp_ps(
735  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
736  fzeroes,
737  _CMP_NEQ_UQ);
738  condition2 = _mm256_cmp_ps(
739  _mm256_cmp_ps(
740  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
741  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
742  _CMP_NEQ_UQ);
743  // Need this condition only for cos
744  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
745  // twos), fours)), fzeroes);
746 
747  sine =
748  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
749  sine = _mm256_sub_ps(
750  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
751  _mm256_storeu_ps(bPtr, sine);
752  aPtr += 8;
753  bPtr += 8;
754  }
755 
756  number = eighthPoints * 8;
757  for (; number < num_points; number++) {
758  *bPtr++ = sin(*aPtr++);
759  }
760 }
761 
762 #endif /* LV_HAVE_AVX2 for unaligned */
763 
764 
765 #ifdef LV_HAVE_SSE4_1
766 #include <smmintrin.h>
767 
768 static inline void
769 volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
770 {
771  float* bPtr = bVector;
772  const float* aPtr = aVector;
773 
774  unsigned int number = 0;
775  unsigned int quarterPoints = num_points / 4;
776  unsigned int i = 0;
777 
778  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
779  fzeroes;
780  __m128 sine, cosine, condition1, condition2;
781  __m128i q, r, ones, twos, fours;
782 
783  m4pi = _mm_set1_ps(1.273239545);
784  pio4A = _mm_set1_ps(0.78515625);
785  pio4B = _mm_set1_ps(0.241876e-3);
786  ffours = _mm_set1_ps(4.0);
787  ftwos = _mm_set1_ps(2.0);
788  fones = _mm_set1_ps(1.0);
789  fzeroes = _mm_setzero_ps();
790  ones = _mm_set1_epi32(1);
791  twos = _mm_set1_epi32(2);
792  fours = _mm_set1_epi32(4);
793 
794  cp1 = _mm_set1_ps(1.0);
795  cp2 = _mm_set1_ps(0.83333333e-1);
796  cp3 = _mm_set1_ps(0.2777778e-2);
797  cp4 = _mm_set1_ps(0.49603e-4);
798  cp5 = _mm_set1_ps(0.551e-6);
799 
800  for (; number < quarterPoints; number++) {
801  aVal = _mm_loadu_ps(aPtr);
802  s = _mm_sub_ps(aVal,
803  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
804  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
805  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
806 
807  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
808  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
809 
810  s = _mm_div_ps(
811  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
812  s = _mm_mul_ps(s, s);
813  // Evaluate Taylor series
814  s = _mm_mul_ps(
815  _mm_add_ps(
816  _mm_mul_ps(
817  _mm_sub_ps(
818  _mm_mul_ps(
819  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
820  cp3),
821  s),
822  cp2),
823  s),
824  cp1),
825  s);
826 
827  for (i = 0; i < 3; i++) {
828  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
829  }
830  s = _mm_div_ps(s, ftwos);
831 
832  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
833  cosine = _mm_sub_ps(fones, s);
834 
835  condition1 = _mm_cmpneq_ps(
836  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
837  condition2 = _mm_cmpneq_ps(
838  _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
839  _mm_cmplt_ps(aVal, fzeroes));
840 
841  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
842  sine =
843  _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
844  _mm_storeu_ps(bPtr, sine);
845  aPtr += 4;
846  bPtr += 4;
847  }
848 
849  number = quarterPoints * 4;
850  for (; number < num_points; number++) {
851  *bPtr++ = sinf(*aPtr++);
852  }
853 }
854 
855 #endif /* LV_HAVE_SSE4_1 for unaligned */
856 
857 
858 #ifdef LV_HAVE_GENERIC
859 
860 static inline void
861 volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
862 {
863  float* bPtr = bVector;
864  const float* aPtr = aVector;
865  unsigned int number = 0;
866 
867  for (number = 0; number < num_points; number++) {
868  *bPtr++ = sinf(*aPtr++);
869  }
870 }
871 
872 #endif /* LV_HAVE_GENERIC */
873 
874 
875 #ifdef LV_HAVE_NEON
876 #include <arm_neon.h>
878 
879 static inline void
880 volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
881 {
882  unsigned int number = 0;
883  unsigned int quarter_points = num_points / 4;
884  float* bVectorPtr = bVector;
885  const float* aVectorPtr = aVector;
886 
887  float32x4_t b_vec;
888  float32x4_t a_vec;
889 
890  for (number = 0; number < quarter_points; number++) {
891  a_vec = vld1q_f32(aVectorPtr);
892  // Prefetch next one, speeds things up
893  __VOLK_PREFETCH(aVectorPtr + 4);
894  b_vec = _vsinq_f32(a_vec);
895  vst1q_f32(bVectorPtr, b_vec);
896  // move pointers ahead
897  bVectorPtr += 4;
898  aVectorPtr += 4;
899  }
900 
901  // Deal with the rest
902  for (number = quarter_points * 4; number < num_points; number++) {
903  *bVectorPtr++ = sinf(*aVectorPtr++);
904  }
905 }
906 
907 #endif /* LV_HAVE_NEON */
908 
909 
910 #endif /* INCLUDED_volk_32f_sin_32f_u_H */
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:861
static void volk_32f_sin_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:880
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
for i
Definition: volk_config_fixed.tmpl.h:25
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:262