76 #ifndef INCLUDED_volk_32f_cos_32f_a_H
77 #define INCLUDED_volk_32f_cos_32f_a_H
79 #ifdef LV_HAVE_AVX512F
81 #include <immintrin.h>
/*
 * Aligned AVX-512F cosine kernel: processes 16 floats from inVector per vector
 * iteration and stores the result to cosVector; the elements left over after
 * the last full group of 16 are handled by a scalar cosf() loop.
 *
 * _mm512_load_ps / _mm512_store_ps are the aligned forms, so both buffers must
 * be 64-byte aligned.
 *
 * NOTE(review): this listing is incomplete. The leading integers look like
 * upstream line numbers and they have gaps: no braces are visible, the __m512
 * declaration list ends on a trailing comma, `i`, `fones`, `sine` and `cosine`
 * are used without a visible declaration, and two statements survive only as
 * fragments. Restore the missing lines from upstream before compiling.
 */
82 static inline void volk_32f_cos_32f_a_avx512f(
float* cosVector,
83 const float* inVector,
84 unsigned int num_points)
86 float* cosPtr = cosVector;
87 const float* inPtr = inVector;
89 unsigned int number = 0;
// Number of full 16-float groups.
90 unsigned int sixteenPoints = num_points / 16;
93 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
95 __m512i q, zeros, ones, twos, fours;
// m4pi is 4/pi. pio4A + pio4B + pio4C add up to pi/4, split into three parts
// that are subtracted one after another in the range reduction below.
97 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
98 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
99 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
100 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
101 ffours = _mm512_set1_ps(4.0);
102 ftwos = _mm512_set1_ps(2.0);
103 fones = _mm512_set1_ps(1.0);
104 zeros = _mm512_setzero_epi32();
105 ones = _mm512_set1_epi32(1);
106 twos = _mm512_set1_epi32(2);
107 fours = _mm512_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
109 cp1 = _mm512_set1_ps(1.0);
110 cp2 = _mm512_set1_ps(0.08333333333333333);
111 cp3 = _mm512_set1_ps(0.002777777777777778);
112 cp4 = _mm512_set1_ps(4.96031746031746e-05);
113 cp5 = _mm512_set1_ps(5.511463844797178e-07);
114 __mmask16 condition1, condition2;
116 for (; number < sixteenPoints; number++) {
117 aVal = _mm512_load_ps(inPtr);
// s = |aVal|: clear the sign bit of every lane.
119 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
// q = floor(s * 4/pi) as int32.
122 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
// r = q + (q & 1): q rounded up to the next even integer, converted to float.
124 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
// s -= r * pi/4, one part of the split constant at a time.
126 s = _mm512_fnmadd_ps(r, pio4A, s);
127 s = _mm512_fnmadd_ps(r, pio4B, s);
128 s = _mm512_fnmadd_ps(r, pio4C, s);
// NOTE(review): only the tail of this statement survives; presumably it is
// s = s / 8 (matching the three doubling rounds below) - confirm upstream.
132 _mm512_set1_ps(8.0f));
133 s = _mm512_mul_ps(s, s);
// NOTE(review): fragment of a nested fmadd/fmsub polynomial in s over
// cp5..cp2; the enclosing calls and the cp1 term are not visible here.
138 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Three rounds of s = s * (4 - s), then halve the result.
143 for (
i = 0;
i < 3;
i++)
144 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
145 s = _mm512_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
147 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
148 cosine = _mm512_sub_ps(fones, s);
// condition1: lanes where ((q + 1) & 2) != 0.
151 condition1 = _mm512_cmpneq_epi32_mask(
152 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
// condition2: lanes where ((q + 2) & 4) != 0.
155 condition2 = _mm512_cmpneq_epi32_mask(
156 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
// Take sine in condition1 lanes, then multiply condition2 lanes by -1.
157 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
158 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
159 _mm512_store_ps(cosPtr, cosine);
// NOTE(review): no advance of inPtr/cosPtr is visible between the store and
// the tail loop (numbering jumps 159 -> 164) - verify against upstream.
// Scalar tail for the elements after the last full group of 16.
164 number = sixteenPoints * 16;
165 for (; number < num_points; number++) {
166 *cosPtr++ = cosf(*inPtr++);
171 #if LV_HAVE_AVX2 && LV_HAVE_FMA
172 #include <immintrin.h>
/*
 * Aligned AVX2+FMA cosine kernel: processes 8 floats from aVector per vector
 * iteration and stores the result to bVector; the elements left over after the
 * last full group of 8 are handled by a scalar loop.
 *
 * _mm256_load_ps / _mm256_store_ps are the aligned forms, so both buffers must
 * be 32-byte aligned.
 *
 * NOTE(review): this listing is incomplete. The return-type line, all braces,
 * the end of the __m256 declaration list (it stops on a trailing comma), and
 * the declarations of `i`, `fones`, `fzeroes`, `sine`, `cosine`, `condition1`
 * and `condition3` are not visible, and several statements survive only as
 * fragments. Restore the missing lines from upstream before compiling.
 */
175 volk_32f_cos_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
177 float* bPtr = bVector;
178 const float* aPtr = aVector;
180 unsigned int number = 0;
// Number of full 8-float groups.
181 unsigned int eighthPoints = num_points / 8;
184 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
187 __m256i q, ones, twos, fours;
// m4pi is 4/pi. pio4A + pio4B + pio4C add up to pi/4, split into three parts
// that are subtracted one after another in the range reduction below.
189 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
190 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
191 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
192 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
193 ffours = _mm256_set1_ps(4.0);
194 ftwos = _mm256_set1_ps(2.0);
195 fones = _mm256_set1_ps(1.0);
196 fzeroes = _mm256_setzero_ps();
197 __m256i zeroes = _mm256_set1_epi32(0);
198 ones = _mm256_set1_epi32(1);
// All bits set; XORed with a compare result below to invert it.
199 __m256i allones = _mm256_set1_epi32(0xffffffff);
200 twos = _mm256_set1_epi32(2);
201 fours = _mm256_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
203 cp1 = _mm256_set1_ps(1.0);
204 cp2 = _mm256_set1_ps(0.08333333333333333);
205 cp3 = _mm256_set1_ps(0.002777777777777778);
206 cp4 = _mm256_set1_ps(4.96031746031746e-05);
207 cp5 = _mm256_set1_ps(5.511463844797178e-07);
211 for (; number < eighthPoints; number++) {
213 aVal = _mm256_load_ps(aPtr);
// s = aVal - (2 * aVal in lanes where aVal < 0), i.e. |aVal|.
215 s = _mm256_sub_ps(aVal,
216 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
217 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as int32.
219 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1): q rounded up to the next even integer, converted to float.
221 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
// s -= r * pi/4, one part of the split constant at a time (fused).
223 s = _mm256_fnmadd_ps(r, pio4A, s);
224 s = _mm256_fnmadd_ps(r, pio4B, s);
225 s = _mm256_fnmadd_ps(r, pio4C, s);
// NOTE(review): only the tail of this statement survives; presumably it is
// s = s / 8 (matching the three doubling rounds below) - confirm upstream.
229 _mm256_set1_ps(8.0));
230 s = _mm256_mul_ps(s, s);
// NOTE(review): fragment of a nested fmadd/fmsub polynomial in s over
// cp5..cp2; the enclosing calls and the cp1 term are not visible here.
235 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Three rounds of s = s * (4 - s), then halve the result.
240 for (
i = 0;
i < 3;
i++)
241 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
242 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
244 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
245 cosine = _mm256_sub_ps(fones, s);
// NOTE(review): the left-hand side of this assignment is missing; judging by
// the next line it is presumably condition1.int_vec.
249 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
// Invert the equality mask: all-ones where ((q + 1) & 2) != 0.
250 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
// condition3: all-ones where ((q + 2) & 4) != 0 (compare-equal, then invert).
253 condition3.int_vec = _mm256_cmpeq_epi32(
254 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
255 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
// In condition1 lanes add (sine - cosine) to cosine, moving the lane to sine.
257 cosine = _mm256_add_ps(
258 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
// In condition3 lanes subtract 2 * cosine, i.e. negate the lane.
259 cosine = _mm256_sub_ps(cosine,
260 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
261 condition3.float_vec));
262 _mm256_store_ps(bPtr, cosine);
// NOTE(review): no advance of aPtr/bPtr is visible between the store and the
// tail loop (numbering jumps 262 -> 267) - verify against upstream.
// Scalar tail for the elements after the last full group of 8.
267 number = eighthPoints * 8;
268 for (; number < num_points; number++) {
// NOTE(review): this calls double-precision cos(); the AVX-512 and SSE4.1
// kernels in this file call cosf() in the same position.
269 *bPtr++ = cos(*aPtr++);
276 #include <immintrin.h>
/*
 * Aligned AVX2 cosine kernel without FMA: processes 8 floats from aVector per
 * vector iteration and stores the result to bVector; the elements left over
 * after the last full group of 8 are handled by a scalar loop.
 *
 * _mm256_load_ps / _mm256_store_ps are the aligned forms, so both buffers must
 * be 32-byte aligned.
 *
 * NOTE(review): this listing is incomplete. The return-type line, all braces,
 * the end of the __m256 declaration list (it stops on a trailing comma), and
 * the declarations of `i`, `fones`, `fzeroes`, `sine`, `cosine`, `condition1`
 * and `condition3` are not visible, and several statements survive only as
 * fragments. Restore the missing lines from upstream before compiling.
 */
279 volk_32f_cos_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
281 float* bPtr = bVector;
282 const float* aPtr = aVector;
284 unsigned int number = 0;
// Number of full 8-float groups.
285 unsigned int eighthPoints = num_points / 8;
288 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
291 __m256i q, ones, twos, fours;
// m4pi is 4/pi. pio4A + pio4B + pio4C add up to pi/4, split into three parts
// that are subtracted one after another in the range reduction below.
293 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
294 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
295 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
296 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
297 ffours = _mm256_set1_ps(4.0);
298 ftwos = _mm256_set1_ps(2.0);
299 fones = _mm256_set1_ps(1.0);
300 fzeroes = _mm256_setzero_ps();
301 __m256i zeroes = _mm256_set1_epi32(0);
302 ones = _mm256_set1_epi32(1);
// All bits set; XORed with a compare result below to invert it.
303 __m256i allones = _mm256_set1_epi32(0xffffffff);
304 twos = _mm256_set1_epi32(2);
305 fours = _mm256_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
307 cp1 = _mm256_set1_ps(1.0);
308 cp2 = _mm256_set1_ps(0.08333333333333333);
309 cp3 = _mm256_set1_ps(0.002777777777777778);
310 cp4 = _mm256_set1_ps(4.96031746031746e-05);
311 cp5 = _mm256_set1_ps(5.511463844797178e-07);
315 for (; number < eighthPoints; number++) {
317 aVal = _mm256_load_ps(aPtr);
// s = aVal - (2 * aVal in lanes where aVal < 0), i.e. |aVal|.
319 s = _mm256_sub_ps(aVal,
320 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
321 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as int32.
323 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1): q rounded up to the next even integer, converted to float.
325 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
// s -= r * pi/4, one part of the split constant at a time (separate mul/sub).
327 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
328 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
329 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
// NOTE(review): only the tail of this statement survives; presumably it is
// s = s / 8 (matching the three doubling rounds below) - confirm upstream.
333 _mm256_set1_ps(8.0));
334 s = _mm256_mul_ps(s, s);
// NOTE(review): fragment of a nested mul/add/sub polynomial in s; only the
// innermost (s * cp5 - cp4) term is visible here.
342 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
// Three rounds of s = s * (4 - s), then halve the result.
351 for (
i = 0;
i < 3;
i++)
352 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
353 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
355 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
356 cosine = _mm256_sub_ps(fones, s);
// NOTE(review): the left-hand side of this assignment is missing; judging by
// the next line it is presumably condition1.int_vec.
360 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
// Invert the equality mask: all-ones where ((q + 1) & 2) != 0.
361 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
// condition3: all-ones where ((q + 2) & 4) != 0 (compare-equal, then invert).
364 condition3.int_vec = _mm256_cmpeq_epi32(
365 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
366 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
// In condition1 lanes add (sine - cosine) to cosine, moving the lane to sine.
368 cosine = _mm256_add_ps(
369 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
// In condition3 lanes subtract 2 * cosine, i.e. negate the lane.
370 cosine = _mm256_sub_ps(cosine,
371 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
372 condition3.float_vec));
373 _mm256_store_ps(bPtr, cosine);
// NOTE(review): no advance of aPtr/bPtr is visible between the store and the
// tail loop (numbering jumps 373 -> 378) - verify against upstream.
// Scalar tail for the elements after the last full group of 8.
378 number = eighthPoints * 8;
379 for (; number < num_points; number++) {
// NOTE(review): this calls double-precision cos(); the AVX-512 and SSE4.1
// kernels in this file call cosf() in the same position.
380 *bPtr++ = cos(*aPtr++);
386 #ifdef LV_HAVE_SSE4_1
387 #include <smmintrin.h>
/*
 * Aligned SSE4.1 cosine kernel: processes 4 floats from aVector per vector
 * iteration and stores the result to bVector; the elements left over after the
 * last full group of 4 are handled by a scalar cosf() loop.
 *
 * _mm_load_ps / _mm_store_ps are the aligned forms, so both buffers must be
 * 16-byte aligned.
 *
 * NOTE(review): this listing is incomplete. The return-type line, all braces,
 * the end of the __m128 declaration list (it stops on a trailing comma), and
 * the declarations of `i`, `fones`, `fzeroes`, `sine`, `cosine`, `condition1`
 * and `condition3` are not visible, and several statements survive only as
 * fragments. Restore the missing lines from upstream before compiling.
 */
390 volk_32f_cos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
392 float* bPtr = bVector;
393 const float* aPtr = aVector;
395 unsigned int number = 0;
// Number of full 4-float groups.
396 unsigned int quarterPoints = num_points / 4;
399 __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
402 __m128i q, ones, twos, fours;
// m4pi is 4/pi. pio4A + pio4B + pio4C add up to pi/4, split into three parts
// that are subtracted one after another in the range reduction below.
404 m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
405 pio4A = _mm_set1_ps(0.7853981554508209228515625);
406 pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
407 pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
408 ffours = _mm_set1_ps(4.0);
409 ftwos = _mm_set1_ps(2.0);
410 fones = _mm_set1_ps(1.0);
411 fzeroes = _mm_setzero_ps();
412 __m128i zeroes = _mm_set1_epi32(0);
413 ones = _mm_set1_epi32(1);
// All bits set; XORed with a compare result below to invert it.
414 __m128i allones = _mm_set1_epi32(0xffffffff);
415 twos = _mm_set1_epi32(2);
416 fours = _mm_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
418 cp1 = _mm_set1_ps(1.0);
419 cp2 = _mm_set1_ps(0.08333333333333333);
420 cp3 = _mm_set1_ps(0.002777777777777778);
421 cp4 = _mm_set1_ps(4.96031746031746e-05);
422 cp5 = _mm_set1_ps(5.511463844797178e-07);
426 for (; number < quarterPoints; number++) {
428 aVal = _mm_load_ps(aPtr);
// NOTE(review): the head of this statement is missing. The visible operand is
// (2 * aVal) masked to lanes where aVal < 0; presumably it is subtracted from
// aVal to form s = |aVal|, as in the AVX2 kernels - confirm upstream.
431 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
// q = floor(s * 4/pi) as int32.
433 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
// r = q + (q & 1): q rounded up to the next even integer, converted to float.
435 r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
// s -= r * pi/4, one part of the split constant at a time.
437 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
438 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
439 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
// NOTE(review): only the argument list of this call survives; presumably it is
// s = s / 8 (matching the three doubling rounds below) - confirm upstream.
442 s, _mm_set1_ps(8.0));
443 s = _mm_mul_ps(s, s);
// NOTE(review): fragment of a nested mul/add/sub polynomial in s; only the
// ((s * cp5 - cp4) * s + ...) part is visible here.
450 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
// Three rounds of s = s * (4 - s), then halve the result.
458 for (
i = 0;
i < 3;
i++)
459 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
460 s = _mm_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
462 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
463 cosine = _mm_sub_ps(fones, s);
// NOTE(review): the left-hand side of this assignment is missing; judging by
// the next line it is presumably condition1.int_vec.
467 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
// Invert the equality mask: all-ones where ((q + 1) & 2) != 0.
468 condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
// NOTE(review): left-hand side missing here as well; presumably
// condition3.int_vec, which the next line inverts so that it is all-ones
// where ((q + 2) & 4) != 0.
472 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
473 condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
// In condition1 lanes add (sine - cosine) to cosine, moving the lane to sine.
475 cosine = _mm_add_ps(cosine,
476 _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
// NOTE(review): the head of this statement is missing. The visible operand is
// (2 * cosine) masked by condition3; presumably it is subtracted from cosine
// to negate those lanes, as in the AVX2 kernels - confirm upstream.
479 _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
480 _mm_store_ps(bPtr, cosine);
// NOTE(review): no advance of aPtr/bPtr is visible between the store and the
// tail loop (numbering jumps 480 -> 485) - verify against upstream.
// Scalar tail for the elements after the last full group of 4.
485 number = quarterPoints * 4;
486 for (; number < num_points; number++) {
487 *bPtr++ = cosf(*aPtr++);
496 #ifndef INCLUDED_volk_32f_cos_32f_u_H
497 #define INCLUDED_volk_32f_cos_32f_u_H
499 #ifdef LV_HAVE_AVX512F
501 #include <immintrin.h>
/*
 * Unaligned AVX-512F cosine kernel: processes 16 floats from inVector per
 * vector iteration and stores the result to cosVector; the elements left over
 * after the last full group of 16 are handled by a scalar cosf() loop.
 *
 * Uses _mm512_loadu_ps / _mm512_storeu_ps, so the buffers need no particular
 * alignment. The arithmetic is otherwise the same sequence as in
 * volk_32f_cos_32f_a_avx512f.
 *
 * NOTE(review): this listing is incomplete. The leading integers look like
 * upstream line numbers and they have gaps: no braces are visible, the __m512
 * declaration list ends on a trailing comma, `i`, `fones`, `sine` and `cosine`
 * are used without a visible declaration, and two statements survive only as
 * fragments. Restore the missing lines from upstream before compiling.
 */
502 static inline void volk_32f_cos_32f_u_avx512f(
float* cosVector,
503 const float* inVector,
504 unsigned int num_points)
506 float* cosPtr = cosVector;
507 const float* inPtr = inVector;
509 unsigned int number = 0;
// Number of full 16-float groups.
510 unsigned int sixteenPoints = num_points / 16;
513 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
515 __m512i q, zeros, ones, twos, fours;
// m4pi is 4/pi. pio4A + pio4B + pio4C add up to pi/4, split into three parts
// that are subtracted one after another in the range reduction below.
517 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
518 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
519 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
520 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
521 ffours = _mm512_set1_ps(4.0);
522 ftwos = _mm512_set1_ps(2.0);
523 fones = _mm512_set1_ps(1.0);
524 zeros = _mm512_setzero_epi32();
525 ones = _mm512_set1_epi32(1);
526 twos = _mm512_set1_epi32(2);
527 fours = _mm512_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
529 cp1 = _mm512_set1_ps(1.0);
530 cp2 = _mm512_set1_ps(0.08333333333333333);
531 cp3 = _mm512_set1_ps(0.002777777777777778);
532 cp4 = _mm512_set1_ps(4.96031746031746e-05);
533 cp5 = _mm512_set1_ps(5.511463844797178e-07);
534 __mmask16 condition1, condition2;
535 for (; number < sixteenPoints; number++) {
536 aVal = _mm512_loadu_ps(inPtr);
// s = |aVal|: clear the sign bit of every lane.
538 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
// q = floor(s * 4/pi) as int32.
541 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
// r = q + (q & 1): q rounded up to the next even integer, converted to float.
543 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
// s -= r * pi/4, one part of the split constant at a time.
545 s = _mm512_fnmadd_ps(r, pio4A, s);
546 s = _mm512_fnmadd_ps(r, pio4B, s);
547 s = _mm512_fnmadd_ps(r, pio4C, s);
// NOTE(review): only the tail of this statement survives; presumably it is
// s = s / 8 (matching the three doubling rounds below) - confirm upstream.
551 _mm512_set1_ps(8.0f));
552 s = _mm512_mul_ps(s, s);
// NOTE(review): fragment of a nested fmadd/fmsub polynomial in s over
// cp5..cp2; the enclosing calls and the cp1 term are not visible here.
557 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Three rounds of s = s * (4 - s), then halve the result.
562 for (
i = 0;
i < 3;
i++)
563 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
564 s = _mm512_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
566 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
567 cosine = _mm512_sub_ps(fones, s);
// condition1: lanes where ((q + 1) & 2) != 0.
570 condition1 = _mm512_cmpneq_epi32_mask(
571 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
// condition2: lanes where ((q + 2) & 4) != 0.
574 condition2 = _mm512_cmpneq_epi32_mask(
575 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
// Take sine in condition1 lanes, then multiply condition2 lanes by -1.
577 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
578 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
579 _mm512_storeu_ps(cosPtr, cosine);
// NOTE(review): no advance of inPtr/cosPtr is visible between the store and
// the tail loop (numbering jumps 579 -> 584) - verify against upstream.
// Scalar tail for the elements after the last full group of 16.
584 number = sixteenPoints * 16;
585 for (; number < num_points; number++) {
586 *cosPtr++ = cosf(*inPtr++);
591 #if LV_HAVE_AVX2 && LV_HAVE_FMA
592 #include <immintrin.h>
/*
 * Unaligned AVX2+FMA cosine kernel: processes 8 floats from aVector per vector
 * iteration and stores the result to bVector; the elements left over after the
 * last full group of 8 are handled by a scalar loop.
 *
 * Uses _mm256_loadu_ps / _mm256_storeu_ps, so the buffers need no particular
 * alignment.
 *
 * NOTE(review): this listing is incomplete. The return-type line, all braces,
 * the end of the __m256 declaration list (it stops on a trailing comma), and
 * the declarations of `i`, `fones`, `fzeroes`, `sine`, `cosine`, `condition1`
 * and `condition3` are not visible, and several statements survive only as
 * fragments. Restore the missing lines from upstream before compiling.
 */
595 volk_32f_cos_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
597 float* bPtr = bVector;
598 const float* aPtr = aVector;
600 unsigned int number = 0;
// Number of full 8-float groups.
601 unsigned int eighthPoints = num_points / 8;
604 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
607 __m256i q, ones, twos, fours;
// m4pi is 4/pi. pio4A + pio4B + pio4C add up to pi/4, split into three parts
// that are subtracted one after another in the range reduction below.
609 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
610 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
611 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
612 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
613 ffours = _mm256_set1_ps(4.0);
614 ftwos = _mm256_set1_ps(2.0);
615 fones = _mm256_set1_ps(1.0);
616 fzeroes = _mm256_setzero_ps();
617 __m256i zeroes = _mm256_set1_epi32(0);
618 ones = _mm256_set1_epi32(1);
// All bits set; XORed with a compare result below to invert it.
619 __m256i allones = _mm256_set1_epi32(0xffffffff);
620 twos = _mm256_set1_epi32(2);
621 fours = _mm256_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
623 cp1 = _mm256_set1_ps(1.0);
624 cp2 = _mm256_set1_ps(0.08333333333333333);
625 cp3 = _mm256_set1_ps(0.002777777777777778);
626 cp4 = _mm256_set1_ps(4.96031746031746e-05);
627 cp5 = _mm256_set1_ps(5.511463844797178e-07);
631 for (; number < eighthPoints; number++) {
633 aVal = _mm256_loadu_ps(aPtr);
// s = aVal - (2 * aVal in lanes where aVal < 0), i.e. |aVal|.
635 s = _mm256_sub_ps(aVal,
636 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
637 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as int32.
639 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1): q rounded up to the next even integer, converted to float.
641 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
// s -= r * pi/4, one part of the split constant at a time (fused).
643 s = _mm256_fnmadd_ps(r, pio4A, s);
644 s = _mm256_fnmadd_ps(r, pio4B, s);
645 s = _mm256_fnmadd_ps(r, pio4C, s);
// NOTE(review): only the tail of this statement survives; presumably it is
// s = s / 8 (matching the three doubling rounds below) - confirm upstream.
649 _mm256_set1_ps(8.0));
650 s = _mm256_mul_ps(s, s);
// NOTE(review): fragment of a nested fmadd/fmsub polynomial in s over
// cp5..cp2; the enclosing calls and the cp1 term are not visible here.
655 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Three rounds of s = s * (4 - s), then halve the result.
660 for (
i = 0;
i < 3;
i++)
661 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
662 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
664 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
665 cosine = _mm256_sub_ps(fones, s);
// NOTE(review): the left-hand side of this assignment is missing; judging by
// the next line it is presumably condition1.int_vec.
669 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
// Invert the equality mask: all-ones where ((q + 1) & 2) != 0.
670 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
// condition3: all-ones where ((q + 2) & 4) != 0 (compare-equal, then invert).
673 condition3.int_vec = _mm256_cmpeq_epi32(
674 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
675 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
// In condition1 lanes add (sine - cosine) to cosine, moving the lane to sine.
677 cosine = _mm256_add_ps(
678 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
// In condition3 lanes subtract 2 * cosine, i.e. negate the lane.
679 cosine = _mm256_sub_ps(cosine,
680 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
681 condition3.float_vec));
682 _mm256_storeu_ps(bPtr, cosine);
// NOTE(review): no advance of aPtr/bPtr is visible between the store and the
// tail loop (numbering jumps 682 -> 687) - verify against upstream.
// Scalar tail for the elements after the last full group of 8.
687 number = eighthPoints * 8;
688 for (; number < num_points; number++) {
// NOTE(review): this calls double-precision cos(); the AVX-512 and SSE4.1
// kernels in this file call cosf() in the same position.
689 *bPtr++ = cos(*aPtr++);
696 #include <immintrin.h>
/*
 * Unaligned AVX2 cosine kernel without FMA: processes 8 floats from aVector
 * per vector iteration and stores the result to bVector; the elements left
 * over after the last full group of 8 are handled by a scalar loop.
 *
 * Uses _mm256_loadu_ps / _mm256_storeu_ps, so the buffers need no particular
 * alignment.
 *
 * NOTE(review): this listing is incomplete. The return-type line, all braces,
 * the end of the __m256 declaration list (it stops on a trailing comma), and
 * the declarations of `i`, `fones`, `fzeroes`, `sine`, `cosine`, `condition1`
 * and `condition3` are not visible, and several statements survive only as
 * fragments. Restore the missing lines from upstream before compiling.
 */
699 volk_32f_cos_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
701 float* bPtr = bVector;
702 const float* aPtr = aVector;
704 unsigned int number = 0;
// Number of full 8-float groups.
705 unsigned int eighthPoints = num_points / 8;
708 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
711 __m256i q, ones, twos, fours;
// m4pi is 4/pi. pio4A + pio4B + pio4C add up to pi/4, split into three parts
// that are subtracted one after another in the range reduction below.
713 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
714 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
715 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
716 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
717 ffours = _mm256_set1_ps(4.0);
718 ftwos = _mm256_set1_ps(2.0);
719 fones = _mm256_set1_ps(1.0);
720 fzeroes = _mm256_setzero_ps();
721 __m256i zeroes = _mm256_set1_epi32(0);
722 ones = _mm256_set1_epi32(1);
// All bits set; XORed with a compare result below to invert it.
723 __m256i allones = _mm256_set1_epi32(0xffffffff);
724 twos = _mm256_set1_epi32(2);
725 fours = _mm256_set1_epi32(4);
// Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400.
727 cp1 = _mm256_set1_ps(1.0);
728 cp2 = _mm256_set1_ps(0.08333333333333333);
729 cp3 = _mm256_set1_ps(0.002777777777777778);
730 cp4 = _mm256_set1_ps(4.96031746031746e-05);
731 cp5 = _mm256_set1_ps(5.511463844797178e-07);
735 for (; number < eighthPoints; number++) {
737 aVal = _mm256_loadu_ps(aPtr);
// s = aVal - (2 * aVal in lanes where aVal < 0), i.e. |aVal|.
739 s = _mm256_sub_ps(aVal,
740 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
741 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as int32.
743 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1): q rounded up to the next even integer, converted to float.
745 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
// s -= r * pi/4, one part of the split constant at a time (separate mul/sub).
747 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
748 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
749 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
// NOTE(review): only the tail of this statement survives; presumably it is
// s = s / 8 (matching the three doubling rounds below) - confirm upstream.
753 _mm256_set1_ps(8.0));
754 s = _mm256_mul_ps(s, s);
// NOTE(review): fragment of a nested mul/add/sub polynomial in s; only the
// innermost (s * cp5 - cp4) term is visible here.
762 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
// Three rounds of s = s * (4 - s), then halve the result.
771 for (
i = 0;
i < 3;
i++)
772 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
773 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
775 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
776 cosine = _mm256_sub_ps(fones, s);
// NOTE(review): the left-hand side of this assignment is missing; judging by
// the next line it is presumably condition1.int_vec.
780 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
// Invert the equality mask: all-ones where ((q + 1) & 2) != 0.
781 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
// condition3: all-ones where ((q + 2) & 4) != 0 (compare-equal, then invert).
784 condition3.int_vec = _mm256_cmpeq_epi32(
785 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
786 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
// In condition1 lanes add (sine - cosine) to cosine, moving the lane to sine.
788 cosine = _mm256_add_ps(
789 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
// In condition3 lanes subtract 2 * cosine, i.e. negate the lane.
790 cosine = _mm256_sub_ps(cosine,
791 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
792 condition3.float_vec));
793 _mm256_storeu_ps(bPtr, cosine);
// NOTE(review): no advance of aPtr/bPtr is visible between the store and the
// tail loop (numbering jumps 793 -> 798) - verify against upstream.
// Scalar tail for the elements after the last full group of 8.
798 number = eighthPoints * 8;
799 for (; number < num_points; number++) {
// NOTE(review): this calls double-precision cos(); the AVX-512 and SSE4.1
// kernels in this file call cosf() in the same position.
800 *bPtr++ = cos(*aPtr++);
806 #ifdef LV_HAVE_SSE4_1
807 #include <smmintrin.h>
/*
 * Unaligned SSE4.1 cosine kernel: processes 4 floats from aVector per vector
 * iteration and stores the result to bVector; the elements left over after the
 * last full group of 4 are handled by a scalar cosf() loop.
 *
 * Uses _mm_loadu_ps / _mm_storeu_ps, so the buffers need no particular
 * alignment.
 *
 * NOTE(review): unlike volk_32f_cos_32f_a_sse4_1, this kernel uses shorter
 * constants, splits pi/4 into two parts instead of three, keeps r as an
 * integer vector, and builds its masks with float compares. Its results may
 * therefore differ slightly from the aligned SSE4.1 kernel - confirm whether
 * that is intended.
 *
 * NOTE(review): this listing is incomplete. The return-type line, most braces,
 * the end of the first __m128 declaration list (it stops on a trailing comma),
 * and the declarations of `i` and `fzeroes` are not visible, and several
 * statements survive only as fragments. Restore from upstream before
 * compiling.
 */
810 volk_32f_cos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
812 float* bPtr = bVector;
813 const float* aPtr = aVector;
815 unsigned int number = 0;
// Number of full 4-float groups.
816 unsigned int quarterPoints = num_points / 4;
819 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
821 __m128 sine, cosine, condition1, condition3;
822 __m128i q, r, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B add up to approximately pi/4 (two-part split).
824 m4pi = _mm_set1_ps(1.273239545);
825 pio4A = _mm_set1_ps(0.78515625);
826 pio4B = _mm_set1_ps(0.241876e-3);
827 ffours = _mm_set1_ps(4.0);
828 ftwos = _mm_set1_ps(2.0);
829 fones = _mm_set1_ps(1.0);
830 fzeroes = _mm_setzero_ps();
831 ones = _mm_set1_epi32(1);
832 twos = _mm_set1_epi32(2);
833 fours = _mm_set1_epi32(4);
// Polynomial coefficients, truncated decimal forms of
// 1, 1/12, 1/360, 1/20160, 1/1814400.
835 cp1 = _mm_set1_ps(1.0);
836 cp2 = _mm_set1_ps(0.83333333e-1);
837 cp3 = _mm_set1_ps(0.2777778e-2);
838 cp4 = _mm_set1_ps(0.49603e-4);
839 cp5 = _mm_set1_ps(0.551e-6);
841 for (; number < quarterPoints; number++) {
842 aVal = _mm_loadu_ps(aPtr);
// NOTE(review): the head of this statement is missing. The visible operand is
// (2 * aVal) masked to lanes where aVal < 0; presumably it is subtracted from
// aVal to form s = |aVal| - confirm upstream.
844 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
// q = floor(s * 4/pi) as int32; r = q + (q & 1), kept as an integer vector.
845 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
846 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
// s -= r * pi/4 in two steps; r is converted to float for each step.
848 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
849 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
// NOTE(review): only the argument list of this call survives; presumably it is
// s = s / 8 (matching the three doubling rounds below) - confirm upstream.
852 s, _mm_set1_ps(8.0));
853 s = _mm_mul_ps(s, s);
// NOTE(review): fragment of a nested mul/add/sub polynomial in s; only the
// ((s * cp5 - cp4) * s + ...) part is visible here.
860 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
// Three rounds of s = s * (4 - s), then halve the result.
// NOTE(review): this loop opens a brace whose closing line is not visible.
868 for (
i = 0;
i < 3;
i++) {
869 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
871 s = _mm_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
873 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
874 cosine = _mm_sub_ps(fones, s);
// condition1: all-ones in lanes where ((q + 1) & 2), converted to float, != 0.
876 condition1 = _mm_cmpneq_ps(
877 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
// condition3: all-ones in lanes where ((q + 2) & 4), converted to float, != 0.
879 condition3 = _mm_cmpneq_ps(
880 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
// In condition1 lanes add (sine - cosine) to cosine, moving the lane to sine.
882 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
// NOTE(review): the head of this statement is missing. The visible operands
// are cosine and (2 * cosine) masked by condition3; presumably a subtraction
// that negates those lanes - confirm upstream.
884 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
885 _mm_storeu_ps(bPtr, cosine);
// NOTE(review): no advance of aPtr/bPtr is visible between the store and the
// tail loop (numbering jumps 885 -> 890) - verify against upstream.
// Scalar tail for the elements after the last full group of 4.
890 number = quarterPoints * 4;
891 for (; number < num_points; number++) {
892 *bPtr++ = cosf(*aPtr++);
899 #ifdef LV_HAVE_GENERIC
/*
 * Scalar cosine approximation that follows the same steps as the SIMD kernels
 * in this file: absolute value, q = (int)(s * 4/pi), polynomial in s, sine and
 * cosine from s, then selection on bits of q.
 *
 * NOTE(review): the signature head is missing from this listing; the tooltip
 * text further down the listing suggests this is
 * volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector,
 * unsigned int num_points) - confirm. Braces, the declarations of `number`,
 * `i` and `N`, the range-reduction lines, the loop body and the bodies of both
 * `if` statements are also not visible.
 */
907 const float* aVector,
908 unsigned int num_points)
910 float* bPtr = bVector;
911 const float* aPtr = aVector;
// m4pi is 4/pi; pio4A + pio4B + pio4C add up to pi/4 split into three parts.
// The literals are double constants narrowed to float on initialisation.
913 float m4pi = 1.273239544735162542821171882678754627704620361328125;
914 float pio4A = 0.7853981554508209228515625;
915 float pio4B = 0.794662735614792836713604629039764404296875e-8;
916 float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
920 for (number = 0; number < num_points; number++) {
// NOTE(review): fabs() is the double-precision function, so the float argument
// is promoted and the result narrowed back; fabsf() would avoid the round trip.
921 float s = fabs(*aPtr);
// Truncating cast; s is non-negative here, so this equals floor(s * 4/pi).
922 int q = (int)(s * m4pi);
// NOTE(review): the range reduction that presumably uses pio4A..pio4C
// (numbering jumps 922 -> 930) is not visible. The line below is only the
// first part of a polynomial in s with coefficients 1/1814400, 1/20160, 1/360
// and 1/12; its tail is missing.
930 s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
// NOTE(review): loop body and the definition of N are not visible; the SIMD
// kernels run three rounds of s = s * (4 - s) at this point.
935 for (
i = 0;
i < N; ++
i) {
// NOTE(review): sqrt() is likewise the double-precision function (cf. sqrtf).
940 float sine = sqrt((2.0 - s) * s);
941 float cosine = 1 - s;
// NOTE(review): the bodies of both conditionals are missing; the SIMD kernels
// select sine for the first test and negate for the second.
943 if (((q + 1) & 2) != 0) {
948 if (((q + 2) & 4) != 0) {
960 #ifdef LV_HAVE_GENERIC
/*
 * Scalar reference path: writes cosf() of each of the num_points input floats
 * to the output, one element at a time.
 *
 * NOTE(review): the signature and all braces are missing from this listing;
 * the tooltip text further down the listing suggests this is
 * volk_32f_cos_32f_generic(float* bVector, const float* aVector,
 * unsigned int num_points) - confirm against upstream.
 */
965 float* bPtr = bVector;
966 const float* aPtr = aVector;
967 unsigned int number = 0;
969 for (; number < num_points; number++) {
// Both pointers advance by one element per iteration.
970 *bPtr++ = cosf(*aPtr++);
978 #include <arm_neon.h>
/*
 * NEON path: loads 4 floats per iteration with vld1q_f32 and stores 4 results
 * with vst1q_f32; the elements left over after the last full group of 4 are
 * handled by a scalar cosf() loop.
 *
 * NOTE(review): the signature, braces, the declarations of a_vec/b_vec, the
 * statement that computes b_vec and any pointer advance in the vector loop are
 * missing from this listing. The tooltip text further down the listing
 * suggests this is volk_32f_cos_32f_neon(float* bVector, const float* aVector,
 * unsigned int num_points) and that b_vec comes from _vcosq_f32(a_vec) -
 * confirm against upstream.
 */
984 unsigned int number = 0;
// Number of full 4-float groups.
985 unsigned int quarter_points = num_points / 4;
986 float* bVectorPtr = bVector;
987 const float* aVectorPtr = aVector;
992 for (number = 0; number < quarter_points; number++) {
993 a_vec = vld1q_f32(aVectorPtr);
// NOTE(review): the computation of b_vec (numbering jumps 993 -> 997) is not
// visible here.
997 vst1q_f32(bVectorPtr, b_vec);
// Scalar tail for the elements after the last full group of 4.
1004 for (number = quarter_points * 4; number < num_points; number++) {
1005 *bVectorPtr++ = cosf(*aVectorPtr++);
Definition: volk_common.h:111
Definition: volk_common.h:128
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:963
static void volk_32f_cos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:982
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:906
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
for i
Definition: volk_config_fixed.tmpl.h:25
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:268