#ifndef INCLUDED_volk_32f_binary_slicer_8i_H
#define INCLUDED_volk_32f_binary_slicer_8i_H
76 #ifdef LV_HAVE_GENERIC
80 unsigned int num_points)
82 int8_t* cPtr = cVector;
83 const float* aPtr = aVector;
84 unsigned int number = 0;
86 for (number = 0; number < num_points; number++) {
97 #ifdef LV_HAVE_GENERIC
100 const float* aVector,
101 unsigned int num_points)
103 int8_t* cPtr = cVector;
104 const float* aPtr = aVector;
105 unsigned int number = 0;
107 for (number = 0; number < num_points; number++) {
108 *cPtr++ = (*aPtr++ >= 0);
115 #include <immintrin.h>
117 static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
118 const float* aVector,
119 unsigned int num_points)
121 int8_t* cPtr = cVector;
122 const float* aPtr = aVector;
123 unsigned int number = 0;
124 unsigned int n32points = num_points / 32;
126 const __m256 zero_val = _mm256_set1_ps(0.0f);
127 __m256 a0_val, a1_val, a2_val, a3_val;
128 __m256 res0_f, res1_f, res2_f, res3_f;
129 __m256i res0_i, res1_i, res2_i, res3_i;
130 __m256i byte_shuffle = _mm256_set_epi8(15,
163 for (number = 0; number < n32points; number++) {
164 a0_val = _mm256_load_ps(aPtr);
165 a1_val = _mm256_load_ps(aPtr + 8);
166 a2_val = _mm256_load_ps(aPtr + 16);
167 a3_val = _mm256_load_ps(aPtr + 24);
170 res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
171 res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
172 res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
173 res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
176 res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
177 res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
178 res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
179 res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
182 res0_i = _mm256_packs_epi32(res0_i, res1_i);
183 res2_i = _mm256_packs_epi32(res2_i, res3_i);
189 res0_i = _mm256_packs_epi16(res0_i, res2_i);
195 res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
201 res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
203 _mm256_store_si256((__m256i*)cPtr, res0_i);
208 for (number = n32points * 32; number < num_points; number++) {
219 #include <immintrin.h>
221 static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
222 const float* aVector,
223 unsigned int num_points)
225 int8_t* cPtr = cVector;
226 const float* aPtr = aVector;
227 unsigned int number = 0;
228 unsigned int n32points = num_points / 32;
230 const __m256 zero_val = _mm256_set1_ps(0.0f);
231 __m256 a0_val, a1_val, a2_val, a3_val;
232 __m256 res0_f, res1_f, res2_f, res3_f;
233 __m256i res0_i, res1_i, res2_i, res3_i;
234 __m256i byte_shuffle = _mm256_set_epi8(15,
267 for (number = 0; number < n32points; number++) {
268 a0_val = _mm256_loadu_ps(aPtr);
269 a1_val = _mm256_loadu_ps(aPtr + 8);
270 a2_val = _mm256_loadu_ps(aPtr + 16);
271 a3_val = _mm256_loadu_ps(aPtr + 24);
274 res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
275 res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
276 res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
277 res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
280 res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
281 res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
282 res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
283 res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
286 res0_i = _mm256_packs_epi32(res0_i, res1_i);
287 res2_i = _mm256_packs_epi32(res2_i, res3_i);
293 res0_i = _mm256_packs_epi16(res0_i, res2_i);
299 res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
305 res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
307 _mm256_storeu_si256((__m256i*)cPtr, res0_i);
312 for (number = n32points * 32; number < num_points; number++) {
325 #include <emmintrin.h>
328 const float* aVector,
329 unsigned int num_points)
331 int8_t* cPtr = cVector;
332 const float* aPtr = aVector;
333 unsigned int number = 0;
335 unsigned int n16points = num_points / 16;
336 __m128 a0_val, a1_val, a2_val, a3_val;
337 __m128 res0_f, res1_f, res2_f, res3_f;
338 __m128i res0_i, res1_i, res2_i, res3_i;
340 zero_val = _mm_set1_ps(0.0f);
342 for (number = 0; number < n16points; number++) {
343 a0_val = _mm_load_ps(aPtr);
344 a1_val = _mm_load_ps(aPtr + 4);
345 a2_val = _mm_load_ps(aPtr + 8);
346 a3_val = _mm_load_ps(aPtr + 12);
349 res0_f = _mm_cmpge_ps(a0_val, zero_val);
350 res1_f = _mm_cmpge_ps(a1_val, zero_val);
351 res2_f = _mm_cmpge_ps(a2_val, zero_val);
352 res3_f = _mm_cmpge_ps(a3_val, zero_val);
355 res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
356 res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
357 res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
358 res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
361 res0_i = _mm_packs_epi32(res0_i, res1_i);
362 res2_i = _mm_packs_epi32(res2_i, res3_i);
365 res0_i = _mm_packs_epi16(res0_i, res2_i);
367 _mm_store_si128((__m128i*)cPtr, res0_i);
373 for (number = n16points * 16; number < num_points; number++) {
385 #include <emmintrin.h>
388 const float* aVector,
389 unsigned int num_points)
391 int8_t* cPtr = cVector;
392 const float* aPtr = aVector;
393 unsigned int number = 0;
395 unsigned int n16points = num_points / 16;
396 __m128 a0_val, a1_val, a2_val, a3_val;
397 __m128 res0_f, res1_f, res2_f, res3_f;
398 __m128i res0_i, res1_i, res2_i, res3_i;
400 zero_val = _mm_set1_ps(0.0f);
402 for (number = 0; number < n16points; number++) {
403 a0_val = _mm_loadu_ps(aPtr);
404 a1_val = _mm_loadu_ps(aPtr + 4);
405 a2_val = _mm_loadu_ps(aPtr + 8);
406 a3_val = _mm_loadu_ps(aPtr + 12);
409 res0_f = _mm_cmpge_ps(a0_val, zero_val);
410 res1_f = _mm_cmpge_ps(a1_val, zero_val);
411 res2_f = _mm_cmpge_ps(a2_val, zero_val);
412 res3_f = _mm_cmpge_ps(a3_val, zero_val);
415 res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
416 res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
417 res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
418 res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
421 res0_i = _mm_packs_epi32(res0_i, res1_i);
422 res2_i = _mm_packs_epi32(res2_i, res3_i);
425 res0_i = _mm_packs_epi16(res0_i, res2_i);
427 _mm_storeu_si128((__m128i*)cPtr, res0_i);
433 for (number = n16points * 16; number < num_points; number++) {
445 #include <arm_neon.h>
448 const float* aVector,
449 unsigned int num_points)
451 int8_t* cPtr = cVector;
452 const float* aPtr = aVector;
453 unsigned int number = 0;
454 unsigned int n16points = num_points / 16;
456 float32x4x2_t input_val0, input_val1;
457 float32x4_t zero_val;
458 uint32x4x2_t res0_u32, res1_u32;
459 uint16x4x2_t res0_u16x4, res1_u16x4;
460 uint16x8x2_t res_u16x8;
464 zero_val = vdupq_n_f32(0.0);
465 one = vdup_n_u8(0x01);
470 for (number = 0; number < n16points; number++) {
471 input_val0 = vld2q_f32(aPtr);
472 input_val1 = vld2q_f32(aPtr + 8);
475 res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
476 res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
477 res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
478 res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
481 res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
482 res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
483 res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
484 res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
486 res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
487 res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
490 res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
491 res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
498 res_u8.val[0] = vand_u8(one, res_u8.val[0]);
499 res_u8.val[1] = vand_u8(one, res_u8.val[1]);
501 vst2_u8((
unsigned char*)cPtr, res_u8);
506 for (number = n16points * 16; number < num_points; number++) {
/*
 * Doxygen cross-reference residue (generated index entries, not source code):
 *   static void volk_32f_binary_slicer_8i_generic_branchless(int8_t *cVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_binary_slicer_8i.h:99
 *   static void volk_32f_binary_slicer_8i_a_sse2(int8_t *cVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_binary_slicer_8i.h:327
 *   static void volk_32f_binary_slicer_8i_u_sse2(int8_t *cVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_binary_slicer_8i.h:387
 *   static void volk_32f_binary_slicer_8i_neon(int8_t *cVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_binary_slicer_8i.h:447
 *   static void volk_32f_binary_slicer_8i_generic(int8_t *cVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_binary_slicer_8i.h:78
 */