GNU Radio 3.4.2 C++ API
|
00001 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H 00002 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H 00003 00004 00005 #include<volk/volk_complex.h> 00006 00007 00008 #ifdef LV_HAVE_GENERIC 00009 00010 00011 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { 00012 00013 float * res = (float*) result; 00014 float * in = (float*) input; 00015 float * tp = (float*) taps; 00016 unsigned int n_2_ccomplex_blocks = num_bytes >> 4; 00017 unsigned int isodd = (num_bytes >> 3) &1; 00018 00019 00020 00021 float sum0[2] = {0,0}; 00022 float sum1[2] = {0,0}; 00023 int i = 0; 00024 00025 00026 for(i = 0; i < n_2_ccomplex_blocks; ++i) { 00027 00028 sum0[0] += in[0] * tp[0] + in[1] * tp[1]; 00029 sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; 00030 sum1[0] += in[2] * tp[2] + in[3] * tp[3]; 00031 sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; 00032 00033 00034 in += 4; 00035 tp += 4; 00036 00037 } 00038 00039 00040 res[0] = sum0[0] + sum1[0]; 00041 res[1] = sum0[1] + sum1[1]; 00042 00043 00044 00045 for(i = 0; i < isodd; ++i) { 00046 00047 00048 *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); 00049 00050 } 00051 /* 00052 for(i = 0; i < num_bytes >> 3; ++i) { 00053 *result += input[i] * conjf(taps[i]); 00054 } 00055 */ 00056 } 00057 00058 #endif /*LV_HAVE_GENERIC*/ 00059 00060 #ifdef LV_HAVE_SSE3 00061 00062 #include <xmmintrin.h> 00063 #include <pmmintrin.h> 00064 #include <mmintrin.h> 00065 00066 00067 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { 00068 00069 __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; 00070 00071 union HalfMask { 00072 uint32_t intRep[4]; 00073 __m128 vec; 00074 } halfMask; 00075 00076 union NegMask { 00077 int intRep[4]; 00078 __m128 vec; 00079 } negMask; 00080 00081 unsigned int offset = 0; 00082 float Rsum=0, Isum=0; 00083 float Im,Re; 00084 00085 __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; 00086 __m128 zv = {0,0,0,0}; 00087 00088 halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; 00089 halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; 00090 00091 negMask.intRep[0] = negMask.intRep[2] = 0x80000000; 00092 negMask.intRep[1] = negMask.intRep[3] = 0; 00093 00094 // main loop 00095 while(num_bytes >= 4*sizeof(float)){ 00096 00097 in1 = _mm_loadu_ps( (float*) (input+offset) ); 00098 in2 = _mm_loadu_ps( (float*) (taps+offset) ); 00099 Rv = in1*in2; 00100 fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); 00101 Iv = in1*fehg; 00102 Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); 00103 Ivm = _mm_xor_ps( negMask.vec, Iv ); 00104 Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); 00105 _mm_store_ss( &Im, Is ); 00106 _mm_store_ss( &Re, Rs ); 00107 num_bytes -= 4*sizeof(float); 00108 offset += 2; 00109 Rsum += Re; 00110 Isum += Im; 00111 } 00112 00113 // handle the last complex case ... 00114 if(num_bytes > 0){ 00115 00116 if(num_bytes != 4){ 00117 // bad things are happening 00118 } 00119 00120 in1 = _mm_loadu_ps( (float*) (input+offset) ); 00121 in2 = _mm_loadu_ps( (float*) (taps+offset) ); 00122 Rv = _mm_and_ps(in1*in2, halfMask.vec); 00123 fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); 00124 Iv = _mm_and_ps(in1*fehg, halfMask.vec); 00125 Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); 00126 Ivm = _mm_xor_ps( negMask.vec, Iv ); 00127 Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); 00128 _mm_store_ss( &Im, Is ); 00129 _mm_store_ss( &Re, Rs ); 00130 Rsum += Re; 00131 Isum += Im; 00132 } 00133 00134 result[0] = lv_cmake(Rsum,Isum); 00135 return; 00136 } 00137 00138 #endif /*LV_HAVE_SSE3*/ 00139 00140 00141 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ 00142 00143 00144