28 #ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
29 #define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
31 #include <immintrin.h>
35 const __m128i zeros = _mm_set1_epi8(0x00);
36 const __m128i sign_extract = _mm_set1_epi8(0x80);
37 const __m256i shuffle_mask = _mm256_setr_epi8(0xff,
69 __m256i sign_bits = _mm256_setzero_si256();
71 fbits = _mm_cmpgt_epi8(fbits, zeros);
72 fbits = _mm_and_si128(fbits, sign_extract);
73 sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
74 sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
75 sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
77 return _mm256_castsi256_ps(sign_bits);
90 llr0 = _mm256_xor_ps(llr0, sign_mask);
91 __m256 dst = _mm256_add_ps(llr0, llr1);
96 const __m256 cplxValue1)
98 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
99 const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0);
100 const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1);
101 const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
102 return _mm256_permutevar8x32_ps(complex_result, idx);
106 const __m256 symbols1,
107 const __m256 points0,
108 const __m256 points1,
116 const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
117 const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
119 return _mm256_mul_ps(norms, scalar);
142 __m256i* max_indices,
143 __m256i* current_indices,
144 __m256i indices_increment)
146 in0 = _mm256_mul_ps(in0, in0);
147 in1 = _mm256_mul_ps(in1, in1);
167 __m256 abs_squared = _mm256_hadd_ps(in0, in1);
178 __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
181 *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);
192 _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
193 _mm256_castsi256_ps(*current_indices),
197 *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
204 __m256i* max_indices,
205 __m256i* current_indices,
206 __m256i indices_increment)
208 in0 = _mm256_mul_ps(in0, in0);
209 in1 = _mm256_mul_ps(in1, in1);
211 __m256 abs_squared = _mm256_hadd_ps(in0, in1);
212 __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
224 *max_values = _mm256_max_ps(abs_squared, *max_values);
227 _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
228 _mm256_castsi256_ps(*current_indices),
231 *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx2_intrinsics.h:105
static __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
Definition: volk_avx2_intrinsics.h:33
static void vector_32fc_index_max_variant1(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:201
static __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0, const __m256 cplxValue1)
Definition: volk_avx2_intrinsics.h:95
static __m256 _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx2_intrinsics.h:81
static void vector_32fc_index_max_variant0(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:139
static void _mm256_polar_deinterleave(__m256 *llr0, __m256 *llr1, __m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:158