1 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
2 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
/*!
 * \brief Scalar (non-SIMD) dot-product loop over two float arrays.
 *
 * Walks input and taps in lockstep and accumulates the product of each
 * element pair into dotProduct.
 *
 * \param result     Output pointer. NOTE(review): the store through it is not
 *                   among the statements visible here - presumably it receives
 *                   the accumulated sum; confirm.
 * \param input      First operand array, read sequentially via aPtr.
 * \param taps       Second operand array, read sequentially via bPtr.
 * \param num_points Number of element pairs to multiply and accumulate.
 */
10 static inline void volk_32f_x2_dot_prod_32f_u_generic(
float * result,
const float * input,
const float *
taps,
unsigned int num_points) {
13 const float* aPtr = input;
14 const float* bPtr=
taps;
15 unsigned int number = 0;
/* One multiply-accumulate per element pair; both cursors advance by one. */
17 for(number = 0; number < num_points; number++){
/* NOTE(review): `+=` assumes dotProduct was declared and zero-initialised
 * earlier in the function - that declaration is not visible here; confirm. */
18 dotProduct += ((*aPtr++) * (*bPtr++));
/*!
 * \brief Dot product of two float arrays using 128-bit SSE multiply and add.
 *
 * Handles num_points / 4 groups of four floats with unaligned loads, keeping
 * four running partial sums in one __m128. The four lanes are then summed
 * into a scalar, and the remaining num_points % 4 element pairs are
 * accumulated with a scalar loop.
 *
 * \param result     Output pointer. NOTE(review): the store through it is not
 *                   among the statements visible here - presumably it receives
 *                   dotProduct; confirm.
 * \param input      First operand array; read with _mm_loadu_ps, so no
 *                   alignment is required by the loads.
 * \param taps       Second operand array; same access pattern as input.
 * \param num_points Number of element pairs in the dot product.
 */
30 static inline void volk_32f_x2_dot_prod_32f_u_sse(
float* result,
const float* input,
const float* taps,
unsigned int num_points) {
32 unsigned int number = 0;
/* Number of full 4-float groups handled by the vector loop. */
33 const unsigned int quarterPoints = num_points / 4;
36 const float* aPtr = input;
37 const float* bPtr =
taps;
39 __m128 aVal, bVal, cVal;
/* Four independent partial sums, one per lane, all starting at 0. */
41 __m128 dotProdVal = _mm_setzero_ps();
43 for(;number < quarterPoints; number++){
/* Unaligned loads of the next four floats from each operand. */
45 aVal = _mm_loadu_ps(aPtr);
46 bVal = _mm_loadu_ps(bPtr);
/* Lane-wise products, then lane-wise accumulation. */
48 cVal = _mm_mul_ps(aVal, bVal);
50 dotProdVal = _mm_add_ps(cVal, dotProdVal);
/* NOTE(review): no advance of aPtr/bPtr is visible in this loop body, yet the
 * scalar tail below relies on both pointers sitting at element
 * quarterPoints * 4 - confirm that each iteration steps them by 4. */
/* _mm_store_ps is the aligned store: dotProductVector must be a 16-byte
 * aligned float[4]. NOTE(review): its declaration is not visible here;
 * confirm the alignment attribute. */
58 _mm_store_ps(dotProductVector,dotProdVal);
/* Horizontal reduction: add the four lane sums into one scalar. */
60 dotProduct = dotProductVector[0];
61 dotProduct += dotProductVector[1];
62 dotProduct += dotProductVector[2];
63 dotProduct += dotProductVector[3];
/* Scalar tail: the num_points % 4 element pairs the vector loop skipped. */
65 number = quarterPoints * 4;
66 for(;number < num_points; number++){
67 dotProduct += ((*aPtr++) * (*bPtr++));
78 #include <pmmintrin.h>
/*!
 * \brief Dot product of two float arrays using SSE3 horizontal adds.
 *
 * Handles num_points / 4 groups of four floats with unaligned loads. Each
 * group's lane-wise products are folded into the accumulator with
 * _mm_hadd_ps. After the loop one more horizontal add and a two-lane scalar
 * sum yield the total, and the remaining num_points % 4 element pairs are
 * accumulated with a scalar loop. The final sum is written to *result.
 *
 * \param result     Receives the computed dot product.
 * \param input      First operand array; read with _mm_loadu_ps, so no
 *                   alignment is required by the loads.
 * \param taps       Second operand array; same access pattern as input.
 * \param num_points Number of element pairs in the dot product.
 */
80 static inline void volk_32f_x2_dot_prod_32f_u_sse3(
float * result,
const float * input,
const float * taps,
unsigned int num_points) {
81 unsigned int number = 0;
/* Number of full 4-float groups handled by the vector loop. */
82 const unsigned int quarterPoints = num_points / 4;
85 const float* aPtr = input;
86 const float* bPtr =
taps;
88 __m128 aVal, bVal, cVal;
90 __m128 dotProdVal = _mm_setzero_ps();
92 for(;number < quarterPoints; number++){
/* Unaligned loads of the next four floats from each operand. */
94 aVal = _mm_loadu_ps(aPtr);
95 bVal = _mm_loadu_ps(bPtr);
97 cVal = _mm_mul_ps(aVal, bVal);
/* _mm_hadd_ps(a, b) yields [a0+a1, a2+a3, b0+b1, b2+b3]: the previous
 * accumulator collapses into the two low lanes while the pair sums of the new
 * products enter the two high lanes. Every product stays in the accumulator;
 * it migrates toward the low lanes on later iterations. */
99 dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
/* NOTE(review): no advance of aPtr/bPtr is visible in this loop body, yet the
 * scalar tail below relies on both pointers sitting at element
 * quarterPoints * 4 - confirm that each iteration steps them by 4. */
/* Final fold: lanes 0 and 1 now hold (d0+d1) and (d2+d3), so their sum is the
 * total of all four accumulator lanes. */
106 dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
/* _mm_store_ps is the aligned store: dotProductVector must be a 16-byte
 * aligned float[4]. NOTE(review): its declaration is not visible here;
 * confirm the alignment attribute. */
108 _mm_store_ps(dotProductVector,dotProdVal);
/* Only lanes 0 and 1 are needed after the final horizontal add. */
110 dotProduct = dotProductVector[0];
111 dotProduct += dotProductVector[1];
/* Scalar tail: the num_points % 4 element pairs the vector loop skipped. */
113 number = quarterPoints * 4;
114 for(;number < num_points; number++){
115 dotProduct += ((*aPtr++) * (*bPtr++));
118 *result = dotProduct;
123 #ifdef LV_HAVE_SSE4_1
125 #include <smmintrin.h>
/*!
 * \brief Dot product of two float arrays using the SSE4.1 _mm_dp_ps
 *        instruction.
 *
 * Handles num_points / 16 groups of sixteen floats per iteration as four
 * 4-element dot products. Each one is steered into a different lane, the four
 * are merged into a single vector, and that vector is added to the
 * accumulator. The four accumulator lanes are then summed into a scalar, and
 * the remaining num_points % 16 element pairs are accumulated with a scalar
 * loop. The final sum is written to *result.
 *
 * \param result     Receives the computed dot product.
 * \param input      First operand array; read with _mm_loadu_ps, so no
 *                   alignment is required by the loads.
 * \param taps       Second operand array; same access pattern as input.
 * \param num_points Number of element pairs in the dot product.
 */
127 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(
float * result,
const float * input,
const float* taps,
unsigned int num_points) {
128 unsigned int number = 0;
/* Number of full 16-float groups handled by the vector loop. */
129 const unsigned int sixteenthPoints = num_points / 16;
131 float dotProduct = 0;
132 const float* aPtr = input;
133 const float* bPtr =
taps;
135 __m128 aVal1, bVal1, cVal1;
136 __m128 aVal2, bVal2, cVal2;
137 __m128 aVal3, bVal3, cVal3;
138 __m128 aVal4, bVal4, cVal4;
/* One partial sum per lane, all starting at 0. */
140 __m128 dotProdVal = _mm_setzero_ps();
142 for(;number < sixteenthPoints; number++){
/* Four unaligned 4-float loads from each operand; each cursor ends up 16
 * elements further on. */
144 aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
145 aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
146 aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
147 aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
149 bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
150 bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
151 bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
152 bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
/* _mm_dp_ps mask: the high nibble 0xF multiplies all four lanes; the low
 * nibble picks the single output lane that receives the sum (0x1 -> lane 0,
 * 0x2 -> lane 1, 0x4 -> lane 2, 0x8 -> lane 3). Unselected lanes are zeroed. */
154 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
155 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
156 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
157 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
/* Each result occupies a distinct lane with zeros elsewhere, so a bitwise OR
 * packs the four sums into one vector without altering them. */
159 cVal1 = _mm_or_ps(cVal1, cVal2);
160 cVal3 = _mm_or_ps(cVal3, cVal4);
161 cVal1 = _mm_or_ps(cVal1, cVal3);
163 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
/* _mm_store_ps is the aligned store: dotProductVector must be a 16-byte
 * aligned float[4]. NOTE(review): its declaration is not visible here;
 * confirm the alignment attribute. */
167 _mm_store_ps(dotProductVector, dotProdVal);
/* Horizontal reduction: add the four lane sums into one scalar. */
169 dotProduct = dotProductVector[0];
170 dotProduct += dotProductVector[1];
171 dotProduct += dotProductVector[2];
172 dotProduct += dotProductVector[3];
/* Scalar tail: the num_points % 16 element pairs the vector loop skipped. */
174 number = sixteenthPoints * 16;
175 for(;number < num_points; number++){
176 dotProduct += ((*aPtr++) * (*bPtr++));
179 *result = dotProduct;