78 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
79 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
86 #include <immintrin.h>
88 static inline void volk_32fc_x2_square_dist_32f_a_avx2(
float* target,
91 unsigned int num_points)
93 const unsigned int num_bytes = num_points * 8;
94 __m128 xmm0, xmm9, xmm10;
95 __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
99 int bound = num_bytes >> 6;
100 int leftovers0 = (num_bytes >> 5) & 1;
101 int leftovers1 = (num_bytes >> 4) & 1;
102 int leftovers2 = (num_bytes >> 3) & 1;
105 __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
106 xmm1 = _mm256_setzero_ps();
107 xmm2 = _mm256_load_ps((
float*)&points[0]);
108 xmm0 = _mm_load_ps((
float*)src0);
109 xmm0 = _mm_permute_ps(xmm0, 0b01000100);
110 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
111 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
112 xmm3 = _mm256_load_ps((
float*)&points[4]);
114 for (;
i < bound; ++
i) {
115 xmm4 = _mm256_sub_ps(xmm1, xmm2);
116 xmm5 = _mm256_sub_ps(xmm1, xmm3);
118 xmm6 = _mm256_mul_ps(xmm4, xmm4);
119 xmm7 = _mm256_mul_ps(xmm5, xmm5);
121 xmm2 = _mm256_load_ps((
float*)&points[0]);
123 xmm4 = _mm256_hadd_ps(xmm6, xmm7);
124 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
126 xmm3 = _mm256_load_ps((
float*)&points[4]);
128 _mm256_store_ps(target, xmm4);
133 for (
i = 0;
i < leftovers0; ++
i) {
135 xmm2 = _mm256_load_ps((
float*)&points[0]);
137 xmm4 = _mm256_sub_ps(xmm1, xmm2);
141 xmm6 = _mm256_mul_ps(xmm4, xmm4);
143 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
144 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
146 xmm9 = _mm256_extractf128_ps(xmm4, 1);
147 _mm_store_ps(target, xmm9);
152 for (
i = 0;
i < leftovers1; ++
i) {
153 xmm9 = _mm_load_ps((
float*)&points[0]);
155 xmm10 = _mm_sub_ps(xmm0, xmm9);
159 xmm9 = _mm_mul_ps(xmm10, xmm10);
161 xmm10 = _mm_hadd_ps(xmm9, xmm9);
163 _mm_storeh_pi((__m64*)target, xmm10);
168 for (
i = 0;
i < leftovers2; ++
i) {
170 diff = src0[0] - points[0];
181 #include <pmmintrin.h>
182 #include <xmmintrin.h>
187 unsigned int num_points)
189 const unsigned int num_bytes = num_points * 8;
191 __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
195 int bound = num_bytes >> 5;
198 xmm1 = _mm_setzero_ps();
199 xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
200 xmm2 = _mm_load_ps((
float*)&points[0]);
201 xmm1 = _mm_movelh_ps(xmm1, xmm1);
202 xmm3 = _mm_load_ps((
float*)&points[2]);
204 for (;
i < bound - 1; ++
i) {
205 xmm4 = _mm_sub_ps(xmm1, xmm2);
206 xmm5 = _mm_sub_ps(xmm1, xmm3);
208 xmm6 = _mm_mul_ps(xmm4, xmm4);
209 xmm7 = _mm_mul_ps(xmm5, xmm5);
211 xmm2 = _mm_load_ps((
float*)&points[0]);
213 xmm4 = _mm_hadd_ps(xmm6, xmm7);
215 xmm3 = _mm_load_ps((
float*)&points[2]);
217 _mm_store_ps(target, xmm4);
222 xmm4 = _mm_sub_ps(xmm1, xmm2);
223 xmm5 = _mm_sub_ps(xmm1, xmm3);
226 xmm6 = _mm_mul_ps(xmm4, xmm4);
227 xmm7 = _mm_mul_ps(xmm5, xmm5);
229 xmm4 = _mm_hadd_ps(xmm6, xmm7);
231 _mm_store_ps(target, xmm4);
235 if (num_bytes >> 4 & 1) {
237 xmm2 = _mm_load_ps((
float*)&points[0]);
239 xmm4 = _mm_sub_ps(xmm1, xmm2);
243 xmm6 = _mm_mul_ps(xmm4, xmm4);
245 xmm4 = _mm_hadd_ps(xmm6, xmm6);
247 _mm_storeh_pi((__m64*)target, xmm4);
252 if (num_bytes >> 3 & 1) {
254 diff = src0[0] - points[0];
266 #include <arm_neon.h>
/*!
 * \brief Computes the squared Euclidean distance from the single complex
 *        scalar src0[0] to each entry of the complex vector points,
 *        writing one float per point into target (NEON).
 *
 * \param target     output buffer of num_points floats
 * \param src0       pointer to the single reference complex value
 * \param points     input vector of num_points complex values
 * \param num_points number of complex points to process
 */
static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    float32x4x2_t a_vec, b_vec;
    float32x4x2_t diff_vec;
    float32x4_t tmp, tmp1, dist_sq;
    /* Broadcast the real and imaginary parts of src0[0] into two lanes-wide
     * registers; vld2q below deinterleaves points the same way. */
    a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
    a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
    for (number = 0; number < quarter_points; ++number) {
        b_vec = vld2q_f32((float*)points);
        diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
        diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
        tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
        tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);

        dist_sq = vaddq_f32(tmp, tmp1); /* re^2 + im^2 per point */
        vst1q_f32(target, dist_sq);
        points += 4;
        target += 4;
    }
    /* Scalar tail for the last num_points % 4 values. */
    for (number = quarter_points * 4; number < num_points; ++number) {
        lv_32fc_t diff = src0[0] - *points++;
        *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
    }
}
300 #ifdef LV_HAVE_GENERIC
304 unsigned int num_points)
306 const unsigned int num_bytes = num_points * 8;
312 for (; i<num_bytes>> 3; ++
i) {
313 diff = src0[0] - points[
i];
326 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
327 #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
329 #include <inttypes.h>
334 #include <immintrin.h>
336 static inline void volk_32fc_x2_square_dist_32f_u_avx2(
float* target,
339 unsigned int num_points)
341 const unsigned int num_bytes = num_points * 8;
343 __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
347 int bound = num_bytes >> 6;
348 int leftovers1 = (num_bytes >> 3) & 0b11;
351 __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
352 xmm1 = _mm256_setzero_ps();
353 xmm2 = _mm256_loadu_ps((
float*)&points[0]);
354 xmm0 = _mm_loadu_ps((
float*)src0);
355 xmm0 = _mm_permute_ps(xmm0, 0b01000100);
356 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
357 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
358 xmm3 = _mm256_loadu_ps((
float*)&points[4]);
360 for (;
i < bound; ++
i) {
361 xmm4 = _mm256_sub_ps(xmm1, xmm2);
362 xmm5 = _mm256_sub_ps(xmm1, xmm3);
364 xmm6 = _mm256_mul_ps(xmm4, xmm4);
365 xmm7 = _mm256_mul_ps(xmm5, xmm5);
367 xmm2 = _mm256_loadu_ps((
float*)&points[0]);
369 xmm4 = _mm256_hadd_ps(xmm6, xmm7);
370 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
372 xmm3 = _mm256_loadu_ps((
float*)&points[4]);
374 _mm256_storeu_ps(target, xmm4);
379 if (num_bytes >> 5 & 1) {
381 xmm2 = _mm256_loadu_ps((
float*)&points[0]);
383 xmm4 = _mm256_sub_ps(xmm1, xmm2);
387 xmm6 = _mm256_mul_ps(xmm4, xmm4);
389 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
390 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
392 xmm9 = _mm256_extractf128_ps(xmm4, 1);
393 _mm_storeu_ps(target, xmm9);
398 for (
i = 0;
i < leftovers1; ++
i) {
400 diff = src0[0] - points[0];
static void volk_32fc_x2_square_dist_32f_generic(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:301
static void volk_32fc_x2_square_dist_32f_neon(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:267
static void volk_32fc_x2_square_dist_32f_a_sse3(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:184
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25