1 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
2 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
/*!
  \brief Dot product of two float vectors, portable scalar implementation
  \param result Receives the sum of input[i] * taps[i] for i in [0, num_points)
  \param input First vector; num_points floats are read
  \param taps Second vector; num_points floats are read
  \param num_points Number of element pairs to multiply and accumulate
*/
static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {

  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  /* With num_points == 0 the loop is skipped and 0 is stored. */
  *result = dotProduct;
}
/*!
  \brief Dot product of two float vectors using SSE
  \param result Receives the sum of input[i] * taps[i] for i in [0, num_points)
  \param input First vector; read with _mm_load_ps, so it must be 16-byte aligned
  \param taps Second vector; read with _mm_load_ps, so it must be 16-byte aligned
  \param num_points Number of element pairs to multiply and accumulate
*/
static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result, const float* input, const float* taps, unsigned int num_points) {

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 aVal, bVal, cVal;

  /* Four independent partial sums, one per lane. */
  __m128 dotProdVal = _mm_setzero_ps();

  for(;number < quarterPoints; number++){

    aVal = _mm_load_ps(aPtr);
    bVal = _mm_load_ps(bPtr);

    cVal = _mm_mul_ps(aVal, bVal);

    dotProdVal = _mm_add_ps(cVal, dotProdVal);

    /* Step to the next group of four; the scalar tail below relies on
       both pointers ending up at element quarterPoints * 4. */
    aPtr += 4;
    bPtr += 4;
  }

  /* Unaligned store: the scratch array is a plain local with no
     alignment guarantee. */
  _mm_storeu_ps(dotProductVector, dotProdVal);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Handle the 0-3 elements that did not fill a whole vector. */
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
79 #include <pmmintrin.h>
/*!
  \brief Dot product of two float vectors using SSE3 horizontal adds
  \param result Receives the sum of input[i] * taps[i] for i in [0, num_points)
  \param input First vector; read with _mm_load_ps, so it must be 16-byte aligned
  \param taps Second vector; read with _mm_load_ps, so it must be 16-byte aligned
  \param num_points Number of element pairs to multiply and accumulate
*/
static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 aVal, bVal, cVal;

  __m128 dotProdVal = _mm_setzero_ps();

  for(;number < quarterPoints; number++){

    aVal = _mm_load_ps(aPtr);
    bVal = _mm_load_ps(bPtr);

    cVal = _mm_mul_ps(aVal, bVal);

    /* hadd(d, c) = {d0+d1, d2+d3, c0+c1, c2+c3}: the sum over all four
       lanes of the accumulator grows by exactly the sum of cVal's lanes. */
    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);

    /* Step to the next group of four; the scalar tail below relies on
       both pointers ending up at element quarterPoints * 4. */
    aPtr += 4;
    bPtr += 4;
  }

  /* One more pairwise fold leaves the grand total split across lanes 0 and 1. */
  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);

  /* Unaligned store: the scratch array is a plain local with no
     alignment guarantee. */
  _mm_storeu_ps(dotProductVector, dotProdVal);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];

  /* Handle the 0-3 elements that did not fill a whole vector. */
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}
124 #ifdef LV_HAVE_SSE4_1
126 #include <smmintrin.h>
/*!
  \brief Dot product of two float vectors using the SSE4.1 dot-product instruction
  \param result Receives the sum of input[i] * taps[i] for i in [0, num_points)
  \param input First vector; read with _mm_load_ps, so it must be 16-byte aligned
  \param taps Second vector; read with _mm_load_ps, so it must be 16-byte aligned
  \param num_points Number of element pairs to multiply and accumulate
*/
static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {

  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 aVal1, bVal1, cVal1;
  __m128 aVal2, bVal2, cVal2;
  __m128 aVal3, bVal3, cVal3;
  __m128 aVal4, bVal4, cVal4;

  __m128 dotProdVal = _mm_setzero_ps();

  /* Each iteration consumes 16 elements from each vector. */
  for(;number < sixteenthPoints; number++){

    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
    aVal4 = _mm_load_ps(aPtr); aPtr += 4;

    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
    bVal4 = _mm_load_ps(bPtr); bPtr += 4;

    /* Mask high nibble 0xF: multiply all four lanes. Low nibble picks
       the single output lane (0, 1, 2, 3); the other lanes are zeroed. */
    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

    /* The four results occupy disjoint lanes with zeros elsewhere, so a
       bitwise OR packs them into one vector. */
    cVal1 = _mm_or_ps(cVal1, cVal2);
    cVal3 = _mm_or_ps(cVal3, cVal4);
    cVal1 = _mm_or_ps(cVal1, cVal3);

    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
  }

  /* Unaligned store: the scratch array is a plain local with no
     alignment guarantee. */
  _mm_storeu_ps(dotProductVector, dotProdVal);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Handle the 0-15 elements that did not fill a whole 16-element group. */
  number = sixteenthPoints * 16;
  for(;number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = dotProduct;
}