Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32f_stddev_and_mean_32f_x2.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014, 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
70 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
71 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
72 
73 #include <inttypes.h>
74 #include <math.h>
75 #include <volk/volk_common.h>
76 
77 // Youngs and Cramer's Algorithm for calculating std and mean
78 // Using the methods discussed here:
79 // https://doi.org/10.1145/3221269.3223036
80 #ifdef LV_HAVE_GENERIC
81 
82 static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
83  float* mean,
84  const float* inputBuffer,
85  unsigned int num_points)
86 {
87  const float* in_ptr = inputBuffer;
88  if (num_points == 0) {
89  return;
90  } else if (num_points == 1) {
91  *stddev = 0.f;
92  *mean = (*in_ptr);
93  return;
94  }
95 
96  float Sum[2];
97  float SquareSum[2] = { 0.f, 0.f };
98  Sum[0] = (*in_ptr++);
99  Sum[1] = (*in_ptr++);
100 
101  uint32_t half_points = num_points / 2;
102 
103  for (uint32_t number = 1; number < half_points; number++) {
104  float Val0 = (*in_ptr++);
105  float Val1 = (*in_ptr++);
106  float n = (float)number;
107  float n_plus_one = n + 1.f;
108  float r = 1.f / (n * n_plus_one);
109 
110  Sum[0] += Val0;
111  Sum[1] += Val1;
112 
113  SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
114  SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
115  }
116 
117  SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
118  Sum[0] += Sum[1];
119 
120  uint32_t points_done = half_points * 2;
121 
122  for (; points_done < num_points; points_done++) {
123  float Val = (*in_ptr++);
124  float n = (float)points_done;
125  float n_plus_one = n + 1.f;
126  float r = 1.f / (n * n_plus_one);
127  Sum[0] += Val;
128  SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
129  }
130  *stddev = sqrtf(SquareSum[0] / num_points);
131  *mean = Sum[0] / num_points;
132 }
133 #endif /* LV_HAVE_GENERIC */
134 
135 static inline float update_square_sum_1_val(const float SquareSum,
136  const float Sum,
137  const uint32_t len,
138  const float val)
139 {
140  // Updates a sum of squares calculated over len values with the value val
141  float n = (float)len;
142  float n_plus_one = n + 1.f;
143  return SquareSum +
144  1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum);
145 }
146 
147 static inline float add_square_sums(const float SquareSum0,
148  const float Sum0,
149  const float SquareSum1,
150  const float Sum1,
151  const uint32_t len)
152 {
153  // Add two sums of squares calculated over the same number of values, len
154  float n = (float)len;
155  return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
156 }
157 
158 static inline void accrue_result(float* PartialSquareSums,
159  float* PartialSums,
160  const uint32_t NumberOfPartitions,
161  const uint32_t PartitionLen)
162 {
163  // Add all partial sums and square sums into the first element of the arrays
164  uint32_t accumulators = NumberOfPartitions;
165  uint32_t stages = 0;
166  uint32_t offset = 1;
167  uint32_t partition_len = PartitionLen;
168 
169  while (accumulators >>= 1) {
170  stages++;
171  } // Integer log2
172  accumulators = NumberOfPartitions;
173 
174  for (uint32_t s = 0; s < stages; s++) {
175  accumulators /= 2;
176  uint32_t idx = 0;
177  for (uint32_t a = 0; a < accumulators; a++) {
178  PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx],
179  PartialSums[idx],
180  PartialSquareSums[idx + offset],
181  PartialSums[idx + offset],
182  partition_len);
183  PartialSums[idx] += PartialSums[idx + offset];
184  idx += 2 * offset;
185  }
186  offset *= 2;
187  partition_len *= 2;
188  }
189 }
190 
191 #ifdef LV_HAVE_NEON
192 #include <arm_neon.h>
194 
195 static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev,
196  float* mean,
197  const float* inputBuffer,
198  unsigned int num_points)
199 {
200  if (num_points < 8) {
201  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
202  return;
203  }
204 
205  const float* in_ptr = inputBuffer;
206 
207  __VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f };
208  __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f };
209 
210  const uint32_t eigth_points = num_points / 8;
211 
212  float32x4_t Sum0, Sum1;
213 
214  Sum0 = vld1q_f32((const float32_t*)in_ptr);
215  in_ptr += 4;
216  __VOLK_PREFETCH(in_ptr + 4);
217 
218  Sum1 = vld1q_f32((const float32_t*)in_ptr);
219  in_ptr += 4;
220  __VOLK_PREFETCH(in_ptr + 4);
221 
222  float32x4_t SquareSum0 = { 0.f };
223  float32x4_t SquareSum1 = { 0.f };
224 
225  float32x4_t Values0, Values1;
226  float32x4_t Aux0, Aux1;
227  float32x4_t Reciprocal;
228 
229  for (uint32_t number = 1; number < eigth_points; number++) {
230  Values0 = vld1q_f32(in_ptr);
231  in_ptr += 4;
232  __VOLK_PREFETCH(in_ptr + 4);
233 
234  Values1 = vld1q_f32(in_ptr);
235  in_ptr += 4;
236  __VOLK_PREFETCH(in_ptr + 4);
237 
238  float n = (float)number;
239  float n_plus_one = n + 1.f;
240  Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
241 
242  Sum0 = vaddq_f32(Sum0, Values0);
243  Aux0 = vdupq_n_f32(n_plus_one);
244  SquareSum0 =
245  _neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
246 
247  Sum1 = vaddq_f32(Sum1, Values1);
248  Aux1 = vdupq_n_f32(n_plus_one);
249  SquareSum1 =
250  _neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
251  }
252 
253  vst1q_f32(&SumLocal[0], Sum0);
254  vst1q_f32(&SumLocal[4], Sum1);
255  vst1q_f32(&SquareSumLocal[0], SquareSum0);
256  vst1q_f32(&SquareSumLocal[4], SquareSum1);
257 
258  accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
259 
260  uint32_t points_done = eigth_points * 8;
261 
262  for (; points_done < num_points; points_done++) {
263  float val = (*in_ptr++);
264  SumLocal[0] += val;
265  SquareSumLocal[0] =
266  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
267  }
268 
269  *stddev = sqrtf(SquareSumLocal[0] / num_points);
270  *mean = SumLocal[0] / num_points;
271 }
272 #endif /* LV_HAVE_NEON */
273 
274 #ifdef LV_HAVE_SSE
276 #include <xmmintrin.h>
277 
278 static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev,
279  float* mean,
280  const float* inputBuffer,
281  unsigned int num_points)
282 {
283  if (num_points < 8) {
284  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
285  return;
286  }
287 
288  const float* in_ptr = inputBuffer;
289 
290  __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
291  __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
292 
293 
294  const uint32_t eigth_points = num_points / 8;
295 
296  __m128 Sum0 = _mm_loadu_ps(in_ptr);
297  in_ptr += 4;
298  __m128 Sum1 = _mm_loadu_ps(in_ptr);
299  in_ptr += 4;
300  __m128 SquareSum0 = _mm_setzero_ps();
301  __m128 SquareSum1 = _mm_setzero_ps();
302  __m128 Values0, Values1;
303  __m128 Aux0, Aux1;
304  __m128 Reciprocal;
305 
306  for (uint32_t number = 1; number < eigth_points; number++) {
307  Values0 = _mm_loadu_ps(in_ptr);
308  in_ptr += 4;
309  __VOLK_PREFETCH(in_ptr + 4);
310 
311  Values1 = _mm_loadu_ps(in_ptr);
312  in_ptr += 4;
313  __VOLK_PREFETCH(in_ptr + 4);
314 
315  float n = (float)number;
316  float n_plus_one = n + 1.f;
317  Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
318 
319  Sum0 = _mm_add_ps(Sum0, Values0);
320  Aux0 = _mm_set_ps1(n_plus_one);
321  SquareSum0 =
322  _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
323 
324  Sum1 = _mm_add_ps(Sum1, Values1);
325  Aux1 = _mm_set_ps1(n_plus_one);
326  SquareSum1 =
327  _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
328  }
329 
330  _mm_store_ps(&SumLocal[0], Sum0);
331  _mm_store_ps(&SumLocal[4], Sum1);
332  _mm_store_ps(&SquareSumLocal[0], SquareSum0);
333  _mm_store_ps(&SquareSumLocal[4], SquareSum1);
334 
335  accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
336 
337  uint32_t points_done = eigth_points * 8;
338 
339  for (; points_done < num_points; points_done++) {
340  float val = (*in_ptr++);
341  SumLocal[0] += val;
342  SquareSumLocal[0] =
343  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
344  }
345 
346  *stddev = sqrtf(SquareSumLocal[0] / num_points);
347  *mean = SumLocal[0] / num_points;
348 }
349 #endif /* LV_HAVE_SSE */
350 
351 #ifdef LV_HAVE_AVX
352 #include <immintrin.h>
354 
355 static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
356  float* mean,
357  const float* inputBuffer,
358  unsigned int num_points)
359 {
360  if (num_points < 16) {
361  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
362  return;
363  }
364 
365  const float* in_ptr = inputBuffer;
366 
367  __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
368  __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
369 
370  const unsigned int sixteenth_points = num_points / 16;
371 
372  __m256 Sum0 = _mm256_loadu_ps(in_ptr);
373  in_ptr += 8;
374  __m256 Sum1 = _mm256_loadu_ps(in_ptr);
375  in_ptr += 8;
376 
377  __m256 SquareSum0 = _mm256_setzero_ps();
378  __m256 SquareSum1 = _mm256_setzero_ps();
379  __m256 Values0, Values1;
380  __m256 Aux0, Aux1;
381  __m256 Reciprocal;
382 
383  for (uint32_t number = 1; number < sixteenth_points; number++) {
384  Values0 = _mm256_loadu_ps(in_ptr);
385  in_ptr += 8;
386  __VOLK_PREFETCH(in_ptr + 8);
387 
388  Values1 = _mm256_loadu_ps(in_ptr);
389  in_ptr += 8;
390  __VOLK_PREFETCH(in_ptr + 8);
391 
392  float n = (float)number;
393  float n_plus_one = n + 1.f;
394 
395  Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
396 
397  Sum0 = _mm256_add_ps(Sum0, Values0);
398  Aux0 = _mm256_set1_ps(n_plus_one);
399  SquareSum0 =
400  _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
401 
402  Sum1 = _mm256_add_ps(Sum1, Values1);
403  Aux1 = _mm256_set1_ps(n_plus_one);
404  SquareSum1 =
405  _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
406  }
407 
408  _mm256_store_ps(&SumLocal[0], Sum0);
409  _mm256_store_ps(&SumLocal[8], Sum1);
410  _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
411  _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
412 
413  accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
414 
415  uint32_t points_done = sixteenth_points * 16;
416 
417  for (; points_done < num_points; points_done++) {
418  float val = (*in_ptr++);
419  SumLocal[0] += val;
420  SquareSumLocal[0] =
421  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
422  }
423 
424  *stddev = sqrtf(SquareSumLocal[0] / num_points);
425  *mean = SumLocal[0] / num_points;
426 }
427 #endif /* LV_HAVE_AVX */
428 
429 #ifdef LV_HAVE_SSE
430 #include <xmmintrin.h>
431 
432 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
433  float* mean,
434  const float* inputBuffer,
435  unsigned int num_points)
436 {
437  if (num_points < 8) {
438  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
439  return;
440  }
441 
442  const float* in_ptr = inputBuffer;
443 
444  __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
445  __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
446 
447 
448  const uint32_t eigth_points = num_points / 8;
449 
450  __m128 Sum0 = _mm_load_ps(in_ptr);
451  in_ptr += 4;
452  __m128 Sum1 = _mm_load_ps(in_ptr);
453  in_ptr += 4;
454  __m128 SquareSum0 = _mm_setzero_ps();
455  __m128 SquareSum1 = _mm_setzero_ps();
456  __m128 Values0, Values1;
457  __m128 Aux0, Aux1;
458  __m128 Reciprocal;
459 
460  for (uint32_t number = 1; number < eigth_points; number++) {
461  Values0 = _mm_load_ps(in_ptr);
462  in_ptr += 4;
463  __VOLK_PREFETCH(in_ptr + 4);
464 
465  Values1 = _mm_load_ps(in_ptr);
466  in_ptr += 4;
467  __VOLK_PREFETCH(in_ptr + 4);
468 
469  float n = (float)number;
470  float n_plus_one = n + 1.f;
471  Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
472 
473  Sum0 = _mm_add_ps(Sum0, Values0);
474  Aux0 = _mm_set_ps1(n_plus_one);
475  SquareSum0 =
476  _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
477 
478  Sum1 = _mm_add_ps(Sum1, Values1);
479  Aux1 = _mm_set_ps1(n_plus_one);
480  SquareSum1 =
481  _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
482  }
483 
484  _mm_store_ps(&SumLocal[0], Sum0);
485  _mm_store_ps(&SumLocal[4], Sum1);
486  _mm_store_ps(&SquareSumLocal[0], SquareSum0);
487  _mm_store_ps(&SquareSumLocal[4], SquareSum1);
488 
489  accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
490 
491  uint32_t points_done = eigth_points * 8;
492 
493  for (; points_done < num_points; points_done++) {
494  float val = (*in_ptr++);
495  SumLocal[0] += val;
496  SquareSumLocal[0] =
497  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
498  }
499 
500  *stddev = sqrtf(SquareSumLocal[0] / num_points);
501  *mean = SumLocal[0] / num_points;
502 }
503 #endif /* LV_HAVE_SSE */
504 
505 #ifdef LV_HAVE_AVX
506 #include <immintrin.h>
507 
508 static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
509  float* mean,
510  const float* inputBuffer,
511  unsigned int num_points)
512 {
513  if (num_points < 16) {
514  volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
515  return;
516  }
517 
518  const float* in_ptr = inputBuffer;
519 
520  __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
521  __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
522 
523  const unsigned int sixteenth_points = num_points / 16;
524 
525  __m256 Sum0 = _mm256_load_ps(in_ptr);
526  in_ptr += 8;
527  __m256 Sum1 = _mm256_load_ps(in_ptr);
528  in_ptr += 8;
529 
530  __m256 SquareSum0 = _mm256_setzero_ps();
531  __m256 SquareSum1 = _mm256_setzero_ps();
532  __m256 Values0, Values1;
533  __m256 Aux0, Aux1;
534  __m256 Reciprocal;
535 
536  for (uint32_t number = 1; number < sixteenth_points; number++) {
537  Values0 = _mm256_load_ps(in_ptr);
538  in_ptr += 8;
539  __VOLK_PREFETCH(in_ptr + 8);
540 
541  Values1 = _mm256_load_ps(in_ptr);
542  in_ptr += 8;
543  __VOLK_PREFETCH(in_ptr + 8);
544 
545  float n = (float)number;
546  float n_plus_one = n + 1.f;
547 
548  Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
549 
550  Sum0 = _mm256_add_ps(Sum0, Values0);
551  Aux0 = _mm256_set1_ps(n_plus_one);
552  SquareSum0 =
553  _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
554 
555  Sum1 = _mm256_add_ps(Sum1, Values1);
556  Aux1 = _mm256_set1_ps(n_plus_one);
557  SquareSum1 =
558  _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
559  }
560 
561  _mm256_store_ps(&SumLocal[0], Sum0);
562  _mm256_store_ps(&SumLocal[8], Sum1);
563  _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
564  _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
565 
566  accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
567 
568  uint32_t points_done = sixteenth_points * 16;
569 
570  for (; points_done < num_points; points_done++) {
571  float val = (*in_ptr++);
572  SumLocal[0] += val;
573  SquareSumLocal[0] =
574  update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
575  }
576 
577  *stddev = sqrtf(SquareSumLocal[0] / num_points);
578  *mean = SumLocal[0] / num_points;
579 }
580 #endif /* LV_HAVE_AVX */
581 
582 #endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
val
Definition: volk_arch_defs.py:66
static void volk_32f_stddev_and_mean_32f_x2_u_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:278
static float add_square_sums(const float SquareSum0, const float Sum0, const float SquareSum1, const float Sum1, const uint32_t len)
Definition: volk_32f_stddev_and_mean_32f_x2.h:147
static void accrue_result(float *PartialSquareSums, float *PartialSums, const uint32_t NumberOfPartitions, const uint32_t PartitionLen)
Definition: volk_32f_stddev_and_mean_32f_x2.h:158
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:355
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:82
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:508
static void volk_32f_stddev_and_mean_32f_x2_neon(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:195
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:432
static float update_square_sum_1_val(const float SquareSum, const float Sum, const uint32_t len, const float val)
Definition: volk_32f_stddev_and_mean_32f_x2.h:135
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:198
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:280
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition: volk_sse_intrinsics.h:62