76 #ifndef INCLUDED_volk_32f_sin_32f_a_H
77 #define INCLUDED_volk_32f_sin_32f_a_H
78 #ifdef LV_HAVE_AVX512F
80 #include <immintrin.h>
/*
 * AVX-512F sine kernel using aligned loads/stores (_mm512_load_ps /
 * _mm512_store_ps). Full groups of 16 floats from inVector go through the
 * vector path and are written to sinVector; the remaining points are
 * computed with scalar sinf().
 * NOTE(review): several lines of this function (braces, part of the
 * declarations, parts of multi-line expressions) are not present in this
 * view; the comments below describe only the statements that are visible.
 */
81 static inline void volk_32f_sin_32f_a_avx512f(
float* sinVector,
82 const float* inVector,
83 unsigned int num_points)
85 float* sinPtr = sinVector;
86 const float* inPtr = inVector;
88 unsigned int number = 0;
// Number of complete 16-float vectors in the input.
89 unsigned int sixteenPoints = num_points / 16;
92 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
95 __m512i q, zeros, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B + pio4C represent pi/4 as three terms of
// decreasing magnitude, subtracted one after another in the loop.
97 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
98 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
99 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
100 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
101 ffours = _mm512_set1_ps(4.0);
102 ftwos = _mm512_set1_ps(2.0);
103 fones = _mm512_set1_ps(1.0);
104 zeros = _mm512_setzero_epi32();
105 ones = _mm512_set1_epi32(1);
106 twos = _mm512_set1_epi32(2);
107 fours = _mm512_set1_epi32(4);
// Coefficients for the polynomial evaluated inside the loop.
109 cp1 = _mm512_set1_ps(1.0);
110 cp2 = _mm512_set1_ps(0.08333333333333333);
111 cp3 = _mm512_set1_ps(0.002777777777777778);
112 cp4 = _mm512_set1_ps(4.96031746031746e-05);
113 cp5 = _mm512_set1_ps(5.511463844797178e-07);
114 __mmask16 condition1, condition2, ltZero;
116 for (; number < sixteenPoints; number++) {
117 aVal = _mm512_load_ps(inPtr);
// s = |aVal|: clear the sign bit of every lane.
119 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
// q = floor(s * 4/pi) as 32-bit integers.
122 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to the next even integer, as float.
124 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
// s -= r * pi/4, applied in three steps (one per pi/4 term).
126 s = _mm512_fnmadd_ps(r, pio4A, s);
127 s = _mm512_fnmadd_ps(r, pio4B, s);
128 s = _mm512_fnmadd_ps(r, pio4C, s);
// Tail of a multi-line call whose start is not shown; its last argument
// is a vector of 8.0f.
132 _mm512_set1_ps(8.0f));
133 s = _mm512_mul_ps(s, s);
// Fragment of the polynomial evaluation: (s*cp5 - cp4)*s + cp3 is passed,
// together with s and cp2, to an enclosing call that starts on a line not shown.
138 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Apply s = s * (4 - s) three times.
143 for (
i = 0;
i < 3;
i++)
144 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
145 s = _mm512_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
147 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
148 cosine = _mm512_sub_ps(fones, s);
// condition1: lanes where ((q + 1) & 2) != 0; these take the cosine value.
150 condition1 = _mm512_cmpneq_epi32_mask(
151 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
// ltZero: lanes whose input is negative.
152 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
// condition2: ((q & 4) != 0) XOR (input < 0); these lanes are negated.
153 condition2 = _mm512_kxor(
154 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
// Select cosine where condition1 is set, otherwise keep sine.
156 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
// Multiply by -1 in the lanes selected by condition2.
157 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
158 _mm512_store_ps(sinPtr, sine);
// Scalar tail: points beyond the last complete group of 16.
163 number = sixteenPoints * 16;
164 for (; number < num_points; number++) {
165 *sinPtr++ = sinf(*inPtr++);
169 #if LV_HAVE_AVX2 && LV_HAVE_FMA
170 #include <immintrin.h>
/*
 * AVX2 + FMA sine kernel using aligned loads/stores (_mm256_load_ps /
 * _mm256_store_ps). Full groups of 8 floats from aVector go through the
 * vector path and are written to bVector; the remaining points are handled
 * by a scalar tail loop.
 * NOTE(review): several lines of this function (return type, braces, part of
 * the declarations, parts of multi-line expressions) are not present in this
 * view; the comments below describe only the statements that are visible.
 */
173 volk_32f_sin_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
175 float* bPtr = bVector;
176 const float* aPtr = aVector;
178 unsigned int number = 0;
// Number of complete 8-float vectors in the input.
179 unsigned int eighthPoints = num_points / 8;
182 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
184 __m256 sine, cosine, condition1, condition2;
185 __m256i q, r, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B represent pi/4 as two terms, subtracted one
// after the other in the loop.
187 m4pi = _mm256_set1_ps(1.273239545);
188 pio4A = _mm256_set1_ps(0.78515625);
189 pio4B = _mm256_set1_ps(0.241876e-3);
190 ffours = _mm256_set1_ps(4.0);
191 ftwos = _mm256_set1_ps(2.0);
192 fones = _mm256_set1_ps(1.0);
193 fzeroes = _mm256_setzero_ps();
194 ones = _mm256_set1_epi32(1);
195 twos = _mm256_set1_epi32(2);
196 fours = _mm256_set1_epi32(4);
// Coefficients for the polynomial evaluated inside the loop.
198 cp1 = _mm256_set1_ps(1.0);
199 cp2 = _mm256_set1_ps(0.83333333e-1);
200 cp3 = _mm256_set1_ps(0.2777778e-2);
201 cp4 = _mm256_set1_ps(0.49603e-4);
202 cp5 = _mm256_set1_ps(0.551e-6);
204 for (; number < eighthPoints; number++) {
205 aVal = _mm256_load_ps(aPtr);
// s = aVal - ((2 * aVal) & mask(aVal < 0)): 2*aVal is subtracted only in
// negative lanes, which yields |aVal|.
206 s = _mm256_sub_ps(aVal,
207 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
208 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as 32-bit integers.
209 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to the next even integer.
210 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
// s -= r * pi/4, applied in two fused steps (one per pi/4 term).
212 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
213 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
// Tail of a multi-line call whose start is not shown; its last argument
// is a vector of 8.0.
217 _mm256_set1_ps(8.0));
218 s = _mm256_mul_ps(s, s);
// Fragment of the polynomial evaluation: (s*cp5 - cp4)*s + cp3 is passed,
// together with s and cp2, to an enclosing call that starts on a line not shown.
223 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Apply s = s * (4 - s) three times.
228 for (
i = 0;
i < 3;
i++) {
229 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
231 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
233 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
234 cosine = _mm256_sub_ps(fones, s);
// condition1 is a compare whose first operand is float((q + 1) & 2); the
// remaining arguments are on lines not shown.
236 condition1 = _mm256_cmp_ps(
237 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
// condition2 is an outer compare of two lane masks: (float(q & 4) != 0) and
// (aVal < 0). The opener of the inner compare and the outer predicate are
// not shown.
// NOTE(review): both operands are all-ones/all-zero masks reinterpreted as
// floats; all-ones is a NaN bit pattern, so an unordered not-equal predicate
// would report true when both masks are set (OR rather than XOR). Confirm the
// sign for negative inputs with (q & 4) set.
240 condition2 = _mm256_cmp_ps(
242 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
243 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
// sine + ((cosine - sine) & condition1): cosine where condition1 is set,
// sine elsewhere (the assignment target is on a line not shown).
250 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
// sine - ((2 * sine) & condition2): negates the lanes selected by condition2.
251 sine = _mm256_sub_ps(
252 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
253 _mm256_store_ps(bPtr, sine);
// Scalar tail: points beyond the last complete group of 8.
// NOTE(review): this tail calls double-precision sin() on float data, while
// the AVX-512 and SSE4.1 kernels in this file use sinf() - confirm intent.
258 number = eighthPoints * 8;
259 for (; number < num_points; number++) {
260 *bPtr++ = sin(*aPtr++);
267 #include <immintrin.h>
/*
 * AVX2 sine kernel (no FMA: separate multiply and subtract) using aligned
 * loads/stores (_mm256_load_ps / _mm256_store_ps). Full groups of 8 floats
 * from aVector go through the vector path and are written to bVector; the
 * remaining points are handled by a scalar tail loop.
 * NOTE(review): several lines of this function (return type, braces, part of
 * the declarations, parts of multi-line expressions) are not present in this
 * view; the comments below describe only the statements that are visible.
 */
270 volk_32f_sin_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
272 float* bPtr = bVector;
273 const float* aPtr = aVector;
275 unsigned int number = 0;
// Number of complete 8-float vectors in the input.
276 unsigned int eighthPoints = num_points / 8;
279 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
281 __m256 sine, cosine, condition1, condition2;
282 __m256i q, r, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B represent pi/4 as two terms, subtracted one
// after the other in the loop.
284 m4pi = _mm256_set1_ps(1.273239545);
285 pio4A = _mm256_set1_ps(0.78515625);
286 pio4B = _mm256_set1_ps(0.241876e-3);
287 ffours = _mm256_set1_ps(4.0);
288 ftwos = _mm256_set1_ps(2.0);
289 fones = _mm256_set1_ps(1.0);
290 fzeroes = _mm256_setzero_ps();
291 ones = _mm256_set1_epi32(1);
292 twos = _mm256_set1_epi32(2);
293 fours = _mm256_set1_epi32(4);
// Coefficients for the polynomial evaluated inside the loop.
295 cp1 = _mm256_set1_ps(1.0);
296 cp2 = _mm256_set1_ps(0.83333333e-1);
297 cp3 = _mm256_set1_ps(0.2777778e-2);
298 cp4 = _mm256_set1_ps(0.49603e-4);
299 cp5 = _mm256_set1_ps(0.551e-6);
301 for (; number < eighthPoints; number++) {
302 aVal = _mm256_load_ps(aPtr);
// s = aVal - ((2 * aVal) & mask(aVal < 0)): 2*aVal is subtracted only in
// negative lanes, which yields |aVal|.
303 s = _mm256_sub_ps(aVal,
304 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
305 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as 32-bit integers.
306 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to the next even integer.
307 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
// s -= r * pi/4, applied in two steps (one per pi/4 term).
309 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
310 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
// Tail of a multi-line call whose start is not shown; its last argument
// is a vector of 8.0.
314 _mm256_set1_ps(8.0));
315 s = _mm256_mul_ps(s, s);
// Fragment of the polynomial evaluation: (s*cp5 - cp4) is the first operand
// of a multiply whose remaining operands are on lines not shown.
323 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
// Apply s = s * (4 - s) three times.
332 for (
i = 0;
i < 3;
i++) {
333 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
335 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
337 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
338 cosine = _mm256_sub_ps(fones, s);
// condition1 is a compare whose first operand is float((q + 1) & 2); the
// remaining arguments are on lines not shown.
340 condition1 = _mm256_cmp_ps(
341 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
// condition2 is an outer compare of two lane masks: (float(q & 4) != 0) and
// (aVal < 0). The opener of the inner compare and the outer predicate are
// not shown.
// NOTE(review): both operands are all-ones/all-zero masks reinterpreted as
// floats; all-ones is a NaN bit pattern, so an unordered not-equal predicate
// would report true when both masks are set (OR rather than XOR). Confirm the
// sign for negative inputs with (q & 4) set.
344 condition2 = _mm256_cmp_ps(
346 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
347 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
// sine + ((cosine - sine) & condition1): cosine where condition1 is set,
// sine elsewhere (the assignment target is on a line not shown).
354 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
// sine - ((2 * sine) & condition2): negates the lanes selected by condition2.
355 sine = _mm256_sub_ps(
356 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
357 _mm256_store_ps(bPtr, sine);
// Scalar tail: points beyond the last complete group of 8.
// NOTE(review): this tail calls double-precision sin() on float data, while
// the AVX-512 and SSE4.1 kernels in this file use sinf() - confirm intent.
362 number = eighthPoints * 8;
363 for (; number < num_points; number++) {
364 *bPtr++ = sin(*aPtr++);
370 #ifdef LV_HAVE_SSE4_1
371 #include <smmintrin.h>
/*
 * SSE4.1 sine kernel using aligned loads/stores (_mm_load_ps /
 * _mm_store_ps). Full groups of 4 floats from aVector go through the vector
 * path and are written to bVector; the remaining points are computed with
 * scalar sinf().
 * NOTE(review): several lines of this function (return type, braces, part of
 * the declarations, parts of multi-line expressions) are not present in this
 * view; the comments below describe only the statements that are visible.
 */
374 volk_32f_sin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
376 float* bPtr = bVector;
377 const float* aPtr = aVector;
379 unsigned int number = 0;
// Number of complete 4-float vectors in the input.
380 unsigned int quarterPoints = num_points / 4;
383 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
385 __m128 sine, cosine, condition1, condition2;
386 __m128i q, r, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B represent pi/4 as two terms, subtracted one
// after the other in the loop.
388 m4pi = _mm_set1_ps(1.273239545);
389 pio4A = _mm_set1_ps(0.78515625);
390 pio4B = _mm_set1_ps(0.241876e-3);
391 ffours = _mm_set1_ps(4.0);
392 ftwos = _mm_set1_ps(2.0);
393 fones = _mm_set1_ps(1.0);
394 fzeroes = _mm_setzero_ps();
395 ones = _mm_set1_epi32(1);
396 twos = _mm_set1_epi32(2);
397 fours = _mm_set1_epi32(4);
// Coefficients for the polynomial evaluated inside the loop.
399 cp1 = _mm_set1_ps(1.0);
400 cp2 = _mm_set1_ps(0.83333333e-1);
401 cp3 = _mm_set1_ps(0.2777778e-2);
402 cp4 = _mm_set1_ps(0.49603e-4);
403 cp5 = _mm_set1_ps(0.551e-6);
405 for (; number < quarterPoints; number++) {
406 aVal = _mm_load_ps(aPtr);
// Operand of a call that starts on a line not shown: 2*aVal in lanes where
// aVal < 0, zero elsewhere.
408 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
// q = floor(s * 4/pi) as 32-bit integers.
409 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to the next even integer.
410 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
// s -= r * pi/4, applied in two steps (one per pi/4 term).
412 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
413 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
// Tail of a multi-line call whose start is not shown; its arguments are s
// and a vector of 8.0.
416 s, _mm_set1_ps(8.0));
417 s = _mm_mul_ps(s, s);
// Fragment of the polynomial evaluation: (s*cp5 - cp4)*s is the first operand
// of an add whose remaining operands are on lines not shown.
424 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
// Apply s = s * (4 - s) three times.
432 for (
i = 0;
i < 3;
i++) {
433 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
435 s = _mm_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
437 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
438 cosine = _mm_sub_ps(fones, s);
// condition1: lanes where float((q + 1) & 2) != 0; these take the cosine value.
440 condition1 = _mm_cmpneq_ps(
441 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
// condition2: float not-equal compare of two lane masks,
// (float(q & 4) != 0) and (aVal < 0); selected lanes are negated below.
// NOTE(review): both operands are all-ones/all-zero masks reinterpreted as
// floats; all-ones is a NaN bit pattern and _mm_cmpneq_ps is an unordered
// compare, so it looks like this reports true when both masks are set (OR
// rather than XOR). Confirm the sign for negative inputs with (q & 4) set.
442 condition2 = _mm_cmpneq_ps(
443 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
444 _mm_cmplt_ps(aVal, fzeroes));
// sine + ((cosine - sine) & condition1): cosine where condition1 is set,
// sine elsewhere.
449 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
// sine - ((2 * sine) & condition2): negates the lanes selected by condition2
// (the assignment target is on a line not shown).
451 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
452 _mm_store_ps(bPtr, sine);
// Scalar tail: points beyond the last complete group of 4.
457 number = quarterPoints * 4;
458 for (; number < num_points; number++) {
459 *bPtr++ = sinf(*aPtr++);
468 #ifndef INCLUDED_volk_32f_sin_32f_u_H
469 #define INCLUDED_volk_32f_sin_32f_u_H
471 #ifdef LV_HAVE_AVX512F
473 #include <immintrin.h>
/*
 * AVX-512F sine kernel using unaligned loads/stores (_mm512_loadu_ps /
 * _mm512_storeu_ps). Full groups of 16 floats from inVector go through the
 * vector path and are written to sinVector; the remaining points are
 * computed with scalar sinf().
 * NOTE(review): several lines of this function (braces, part of the
 * declarations, parts of multi-line expressions) are not present in this
 * view; the comments below describe only the statements that are visible.
 */
474 static inline void volk_32f_sin_32f_u_avx512f(
float* sinVector,
475 const float* inVector,
476 unsigned int num_points)
478 float* sinPtr = sinVector;
479 const float* inPtr = inVector;
481 unsigned int number = 0;
// Number of complete 16-float vectors in the input.
482 unsigned int sixteenPoints = num_points / 16;
485 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
488 __m512i q, zeros, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B + pio4C represent pi/4 as three terms of
// decreasing magnitude, subtracted one after another in the loop.
490 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
491 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
492 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
493 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
494 ffours = _mm512_set1_ps(4.0);
495 ftwos = _mm512_set1_ps(2.0);
496 fones = _mm512_set1_ps(1.0);
497 zeros = _mm512_setzero_epi32();
498 ones = _mm512_set1_epi32(1);
499 twos = _mm512_set1_epi32(2);
500 fours = _mm512_set1_epi32(4);
// Coefficients for the polynomial evaluated inside the loop.
502 cp1 = _mm512_set1_ps(1.0);
503 cp2 = _mm512_set1_ps(0.08333333333333333);
504 cp3 = _mm512_set1_ps(0.002777777777777778);
505 cp4 = _mm512_set1_ps(4.96031746031746e-05);
506 cp5 = _mm512_set1_ps(5.511463844797178e-07);
507 __mmask16 condition1, condition2, ltZero;
509 for (; number < sixteenPoints; number++) {
510 aVal = _mm512_loadu_ps(inPtr);
// s = |aVal|: clear the sign bit of every lane.
512 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
// q = floor(s * 4/pi) as 32-bit integers.
515 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to the next even integer, as float.
517 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
// s -= r * pi/4, applied in three steps (one per pi/4 term).
519 s = _mm512_fnmadd_ps(r, pio4A, s);
520 s = _mm512_fnmadd_ps(r, pio4B, s);
521 s = _mm512_fnmadd_ps(r, pio4C, s);
// Tail of a multi-line call whose start is not shown; its last argument
// is a vector of 8.0f.
525 _mm512_set1_ps(8.0f));
526 s = _mm512_mul_ps(s, s);
// Fragment of the polynomial evaluation: (s*cp5 - cp4)*s + cp3 is passed,
// together with s and cp2, to an enclosing call that starts on a line not shown.
531 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Apply s = s * (4 - s) three times.
536 for (
i = 0;
i < 3;
i++)
537 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
538 s = _mm512_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
540 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
541 cosine = _mm512_sub_ps(fones, s);
// condition1: lanes where ((q + 1) & 2) != 0; these take the cosine value.
543 condition1 = _mm512_cmpneq_epi32_mask(
544 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
// ltZero: lanes whose input is negative.
545 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
// condition2: ((q & 4) != 0) XOR (input < 0); these lanes are negated.
546 condition2 = _mm512_kxor(
547 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
// Select cosine where condition1 is set, otherwise keep sine.
549 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
// Multiply by -1 in the lanes selected by condition2.
550 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
551 _mm512_storeu_ps(sinPtr, sine);
// Scalar tail: points beyond the last complete group of 16.
556 number = sixteenPoints * 16;
557 for (; number < num_points; number++) {
558 *sinPtr++ = sinf(*inPtr++);
563 #if LV_HAVE_AVX2 && LV_HAVE_FMA
564 #include <immintrin.h>
/*
 * AVX2 + FMA sine kernel using unaligned loads/stores (_mm256_loadu_ps /
 * _mm256_storeu_ps). Full groups of 8 floats from aVector go through the
 * vector path and are written to bVector; the remaining points are handled
 * by a scalar tail loop.
 * NOTE(review): several lines of this function (return type, braces, part of
 * the declarations, parts of multi-line expressions) are not present in this
 * view; the comments below describe only the statements that are visible.
 */
567 volk_32f_sin_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
569 float* bPtr = bVector;
570 const float* aPtr = aVector;
572 unsigned int number = 0;
// Number of complete 8-float vectors in the input.
573 unsigned int eighthPoints = num_points / 8;
576 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
578 __m256 sine, cosine, condition1, condition2;
579 __m256i q, r, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B represent pi/4 as two terms, subtracted one
// after the other in the loop.
581 m4pi = _mm256_set1_ps(1.273239545);
582 pio4A = _mm256_set1_ps(0.78515625);
583 pio4B = _mm256_set1_ps(0.241876e-3);
584 ffours = _mm256_set1_ps(4.0);
585 ftwos = _mm256_set1_ps(2.0);
586 fones = _mm256_set1_ps(1.0);
587 fzeroes = _mm256_setzero_ps();
588 ones = _mm256_set1_epi32(1);
589 twos = _mm256_set1_epi32(2);
590 fours = _mm256_set1_epi32(4);
// Coefficients for the polynomial evaluated inside the loop.
592 cp1 = _mm256_set1_ps(1.0);
593 cp2 = _mm256_set1_ps(0.83333333e-1);
594 cp3 = _mm256_set1_ps(0.2777778e-2);
595 cp4 = _mm256_set1_ps(0.49603e-4);
596 cp5 = _mm256_set1_ps(0.551e-6);
598 for (; number < eighthPoints; number++) {
599 aVal = _mm256_loadu_ps(aPtr);
// s = aVal - ((2 * aVal) & mask(aVal < 0)): 2*aVal is subtracted only in
// negative lanes, which yields |aVal|.
600 s = _mm256_sub_ps(aVal,
601 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
602 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as 32-bit integers.
603 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to the next even integer.
604 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
// s -= r * pi/4, applied in two fused steps (one per pi/4 term).
606 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
607 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
// Tail of a multi-line call whose start is not shown; its last argument
// is a vector of 8.0.
611 _mm256_set1_ps(8.0));
612 s = _mm256_mul_ps(s, s);
// Fragment of the polynomial evaluation: (s*cp5 - cp4)*s + cp3 is passed,
// together with s and cp2, to an enclosing call that starts on a line not shown.
617 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
// Apply s = s * (4 - s) three times.
622 for (
i = 0;
i < 3;
i++) {
623 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
625 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
627 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
628 cosine = _mm256_sub_ps(fones, s);
// condition1 is a compare whose first operand is float((q + 1) & 2); the
// remaining arguments are on lines not shown.
630 condition1 = _mm256_cmp_ps(
631 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
// condition2 is an outer compare of two lane masks: (float(q & 4) != 0) and
// (aVal < 0). The opener of the inner compare and the outer predicate are
// not shown.
// NOTE(review): both operands are all-ones/all-zero masks reinterpreted as
// floats; all-ones is a NaN bit pattern, so an unordered not-equal predicate
// would report true when both masks are set (OR rather than XOR). Confirm the
// sign for negative inputs with (q & 4) set.
634 condition2 = _mm256_cmp_ps(
636 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
637 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
// sine + ((cosine - sine) & condition1): cosine where condition1 is set,
// sine elsewhere (the assignment target is on a line not shown).
644 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
// sine - ((2 * sine) & condition2): negates the lanes selected by condition2.
645 sine = _mm256_sub_ps(
646 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
647 _mm256_storeu_ps(bPtr, sine);
// Scalar tail: points beyond the last complete group of 8.
// NOTE(review): this tail calls double-precision sin() on float data, while
// the AVX-512 and SSE4.1 kernels in this file use sinf() - confirm intent.
652 number = eighthPoints * 8;
653 for (; number < num_points; number++) {
654 *bPtr++ = sin(*aPtr++);
661 #include <immintrin.h>
/*
 * AVX2 sine kernel (no FMA: separate multiply and subtract) using unaligned
 * loads/stores (_mm256_loadu_ps / _mm256_storeu_ps). Full groups of 8 floats
 * from aVector go through the vector path and are written to bVector; the
 * remaining points are handled by a scalar tail loop.
 * NOTE(review): several lines of this function (return type, braces, part of
 * the declarations, parts of multi-line expressions) are not present in this
 * view; the comments below describe only the statements that are visible.
 */
664 volk_32f_sin_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
666 float* bPtr = bVector;
667 const float* aPtr = aVector;
669 unsigned int number = 0;
// Number of complete 8-float vectors in the input.
670 unsigned int eighthPoints = num_points / 8;
673 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
675 __m256 sine, cosine, condition1, condition2;
676 __m256i q, r, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B represent pi/4 as two terms, subtracted one
// after the other in the loop.
678 m4pi = _mm256_set1_ps(1.273239545);
679 pio4A = _mm256_set1_ps(0.78515625);
680 pio4B = _mm256_set1_ps(0.241876e-3);
681 ffours = _mm256_set1_ps(4.0);
682 ftwos = _mm256_set1_ps(2.0);
683 fones = _mm256_set1_ps(1.0);
684 fzeroes = _mm256_setzero_ps();
685 ones = _mm256_set1_epi32(1);
686 twos = _mm256_set1_epi32(2);
687 fours = _mm256_set1_epi32(4);
// Coefficients for the polynomial evaluated inside the loop.
689 cp1 = _mm256_set1_ps(1.0);
690 cp2 = _mm256_set1_ps(0.83333333e-1);
691 cp3 = _mm256_set1_ps(0.2777778e-2);
692 cp4 = _mm256_set1_ps(0.49603e-4);
693 cp5 = _mm256_set1_ps(0.551e-6);
695 for (; number < eighthPoints; number++) {
696 aVal = _mm256_loadu_ps(aPtr);
// s = aVal - ((2 * aVal) & mask(aVal < 0)): 2*aVal is subtracted only in
// negative lanes, which yields |aVal|.
697 s = _mm256_sub_ps(aVal,
698 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
699 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
// q = floor(s * 4/pi) as 32-bit integers.
700 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to the next even integer.
701 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
// s -= r * pi/4, applied in two steps (one per pi/4 term).
703 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
704 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
// Tail of a multi-line call whose start is not shown; its last argument
// is a vector of 8.0.
708 _mm256_set1_ps(8.0));
709 s = _mm256_mul_ps(s, s);
// Fragment of the polynomial evaluation: (s*cp5 - cp4) is the first operand
// of a multiply whose remaining operands are on lines not shown.
717 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
// Apply s = s * (4 - s) three times.
726 for (
i = 0;
i < 3;
i++) {
727 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
729 s = _mm256_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
731 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
732 cosine = _mm256_sub_ps(fones, s);
// condition1 is a compare whose first operand is float((q + 1) & 2); the
// remaining arguments are on lines not shown.
734 condition1 = _mm256_cmp_ps(
735 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
// condition2 is an outer compare of two lane masks: (float(q & 4) != 0) and
// (aVal < 0). The opener of the inner compare and the outer predicate are
// not shown.
// NOTE(review): both operands are all-ones/all-zero masks reinterpreted as
// floats; all-ones is a NaN bit pattern, so an unordered not-equal predicate
// would report true when both masks are set (OR rather than XOR). Confirm the
// sign for negative inputs with (q & 4) set.
738 condition2 = _mm256_cmp_ps(
740 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
741 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
// sine + ((cosine - sine) & condition1): cosine where condition1 is set,
// sine elsewhere (the assignment target is on a line not shown).
748 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
// sine - ((2 * sine) & condition2): negates the lanes selected by condition2.
749 sine = _mm256_sub_ps(
750 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
751 _mm256_storeu_ps(bPtr, sine);
// Scalar tail: points beyond the last complete group of 8.
// NOTE(review): this tail calls double-precision sin() on float data, while
// the AVX-512 and SSE4.1 kernels in this file use sinf() - confirm intent.
756 number = eighthPoints * 8;
757 for (; number < num_points; number++) {
758 *bPtr++ = sin(*aPtr++);
765 #ifdef LV_HAVE_SSE4_1
766 #include <smmintrin.h>
/*
 * SSE4.1 sine kernel using unaligned loads/stores (_mm_loadu_ps /
 * _mm_storeu_ps). Full groups of 4 floats from aVector go through the vector
 * path and are written to bVector; the remaining points are computed with
 * scalar sinf().
 * NOTE(review): several lines of this function (return type, braces, part of
 * the declarations, parts of multi-line expressions) are not present in this
 * view; the comments below describe only the statements that are visible.
 */
769 volk_32f_sin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
771 float* bPtr = bVector;
772 const float* aPtr = aVector;
774 unsigned int number = 0;
// Number of complete 4-float vectors in the input.
775 unsigned int quarterPoints = num_points / 4;
778 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
780 __m128 sine, cosine, condition1, condition2;
781 __m128i q, r, ones, twos, fours;
// m4pi is 4/pi; pio4A + pio4B represent pi/4 as two terms, subtracted one
// after the other in the loop.
783 m4pi = _mm_set1_ps(1.273239545);
784 pio4A = _mm_set1_ps(0.78515625);
785 pio4B = _mm_set1_ps(0.241876e-3);
786 ffours = _mm_set1_ps(4.0);
787 ftwos = _mm_set1_ps(2.0);
788 fones = _mm_set1_ps(1.0);
789 fzeroes = _mm_setzero_ps();
790 ones = _mm_set1_epi32(1);
791 twos = _mm_set1_epi32(2);
792 fours = _mm_set1_epi32(4);
// Coefficients for the polynomial evaluated inside the loop.
794 cp1 = _mm_set1_ps(1.0);
795 cp2 = _mm_set1_ps(0.83333333e-1);
796 cp3 = _mm_set1_ps(0.2777778e-2);
797 cp4 = _mm_set1_ps(0.49603e-4);
798 cp5 = _mm_set1_ps(0.551e-6);
800 for (; number < quarterPoints; number++) {
801 aVal = _mm_loadu_ps(aPtr);
// Operand of a call that starts on a line not shown: 2*aVal in lanes where
// aVal < 0, zero elsewhere.
803 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
// q = floor(s * 4/pi) as 32-bit integers.
804 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
// r = q + (q & 1), i.e. q rounded up to the next even integer.
805 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
// s -= r * pi/4, applied in two steps (one per pi/4 term).
807 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
808 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
// Tail of a multi-line call whose start is not shown; its arguments are s
// and a vector of 8.0.
811 s, _mm_set1_ps(8.0));
812 s = _mm_mul_ps(s, s);
// Fragment of the polynomial evaluation: (s*cp5 - cp4)*s is the first operand
// of an add whose remaining operands are on lines not shown.
819 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
// Apply s = s * (4 - s) three times.
827 for (
i = 0;
i < 3;
i++) {
828 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
830 s = _mm_div_ps(s, ftwos);
// sine = sqrt((2 - s) * s), cosine = 1 - s.
832 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
833 cosine = _mm_sub_ps(fones, s);
// condition1: lanes where float((q + 1) & 2) != 0; these take the cosine value.
835 condition1 = _mm_cmpneq_ps(
836 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
// condition2: float not-equal compare of two lane masks,
// (float(q & 4) != 0) and (aVal < 0); selected lanes are negated below.
// NOTE(review): both operands are all-ones/all-zero masks reinterpreted as
// floats; all-ones is a NaN bit pattern and _mm_cmpneq_ps is an unordered
// compare, so it looks like this reports true when both masks are set (OR
// rather than XOR). Confirm the sign for negative inputs with (q & 4) set.
837 condition2 = _mm_cmpneq_ps(
838 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
839 _mm_cmplt_ps(aVal, fzeroes));
// sine + ((cosine - sine) & condition1): cosine where condition1 is set,
// sine elsewhere.
841 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
// sine - ((2 * sine) & condition2): negates the lanes selected by condition2
// (the assignment target is on a line not shown).
843 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
844 _mm_storeu_ps(bPtr, sine);
// Scalar tail: points beyond the last complete group of 4.
849 number = quarterPoints * 4;
850 for (; number < num_points; number++) {
851 *bPtr++ = sinf(*aPtr++);
858 #ifdef LV_HAVE_GENERIC
// Scalar path: one sinf() call per element, for num_points elements read
// through aPtr and written through bPtr.
// NOTE(review): the function signature and braces are on lines not present
// in this view.
863 float* bPtr = bVector;
864 const float* aPtr = aVector;
865 unsigned int number = 0;
867 for (number = 0; number < num_points; number++) {
868 *bPtr++ = sinf(*aPtr++);
876 #include <arm_neon.h>
// NEON path: handles num_points / 4 groups of four floats with vector
// load/store, then finishes the remaining points with scalar sinf().
// NOTE(review): the function signature, the declarations of a_vec / b_vec,
// the statement that computes b_vec and the pointer increments are on lines
// not present in this view.
882 unsigned int number = 0;
883 unsigned int quarter_points = num_points / 4;
884 float* bVectorPtr = bVector;
885 const float* aVectorPtr = aVector;
890 for (number = 0; number < quarter_points; number++) {
// Load four input floats.
891 a_vec = vld1q_f32(aVectorPtr);
// Store four result floats.
895 vst1q_f32(bVectorPtr, b_vec);
// Scalar tail: points beyond the last complete group of 4.
902 for (number = quarter_points * 4; number < num_points; number++) {
903 *bVectorPtr++ = sinf(*aVectorPtr++);
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:861
static void volk_32f_sin_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:880
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
for i
Definition: volk_config_fixed.tmpl.h:25
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:262