/* GNU Radio 3.4.2 C++ API — VOLK kernel: 16i max* horizontal (pairwise max of adjacent int16 values) */
#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSSE3

#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>

/*!
  \brief Reduces each adjacent pair of int16 values to the "winner" of the pair.

  For every pair (src0[2k], src0[2k+1]) this writes
      target[k] = ((int16_t)(src0[2k] - src0[2k+1]) > 0) ? src0[2k] : src0[2k+1];
  i.e. the comparison is made on the WRAPPED 16-bit difference, exactly matching
  the generic implementation below (this is the max* selection step used in
  trellis/turbo decoding, so the wrap-around semantics are intentional).

  \param target    output buffer; receives num_bytes/4 int16 results; 16-byte aligned
  \param src0      input buffer of num_bytes/2 int16 values; 16-byte aligned
  \param num_bytes number of BYTES in src0 (not element count)
*/
static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {

  /* Shuffle control: bytes 0,1,4,5,8,9,12,13 select the even-indexed (first)
     word of each pair; adding 2 to a lane (via the andmasks below) switches
     that lane to the odd (second) word.  0xff lanes zero the output byte, so
     shufmask0 fills only the low 8 result bytes and shufmask1 only the high 8
     — the two shuffled vectors are then combined with a plain add. */
  static const uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
                                        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
  static const uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                        0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
  static const uint8_t andmask0[16]  = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
                                        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
  static const uint8_t andmask1[16]  = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};

  /* NOTE(review): the original declared xmm0..xmm4 volatile — an
     optimization-blocking leftover with no effect on results; dropped. */
  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
  __m128i xmm5, xmm6, xmm7, xmm8;

  /* Unaligned loads: a plain static uint8_t[16] carries no 16-byte alignment
     guarantee, so the original _mm_load_si128 here relied on luck. */
  xmm4 = _mm_loadu_si128((const __m128i*)shufmask0);
  xmm5 = _mm_loadu_si128((const __m128i*)shufmask1);
  xmm6 = _mm_loadu_si128((const __m128i*)andmask0);
  xmm7 = _mm_loadu_si128((const __m128i*)andmask1);

  __m128i *p_target = (__m128i*)target;
  __m128i *p_src0   = (__m128i*)src0;

  int bound        = num_bytes >> 5;        /* full 32-byte (two-vector) iterations   */
  int intermediate = (num_bytes >> 4) & 1;  /* one extra single-vector (16B) pass?    */
  int leftovers    = (num_bytes >> 1) & 7;  /* remaining int16 elements (< 8), scalar */

  int i = 0;

  /* xmm1 was read uninitialized in the intermediate loop whenever bound == 0.
     Its stale value never reaches the 8 stored bytes (the high hsub half is
     masked off by shufmask0/andmask0), but zero it anyway: reading an
     uninitialized object is undefined behavior. */
  xmm1 = _mm_setzero_si128();

  for (i = 0; i < bound; ++i) {
    xmm0 = _mm_load_si128(p_src0);
    xmm1 = _mm_load_si128(&p_src0[1]);
    p_src0 += 2;

    /* Per-pair differences: hsub yields a[2k]-a[2k+1] in each 16-bit lane
       (xmm0's pairs in the low half, xmm1's in the high half). */
    xmm3 = _mm_hsub_epi16(xmm0, xmm1);

    /* All-ones mask in lanes where the difference is negative, i.e. where the
       second element of the pair wins. */
    xmm2 = _mm_cmpgt_epi16(_mm_setzero_si128(), xmm3);

    /* Bump the shuffle byte-indices by 2 in exactly those lanes. */
    xmm8 = _mm_and_si128(xmm2, xmm6);
    xmm3 = _mm_and_si128(xmm2, xmm7);
    xmm8 = _mm_add_epi8(xmm8, xmm4);
    xmm3 = _mm_add_epi8(xmm3, xmm5);

    /* Gather the winners: xmm0 contributes the low 8 result bytes, xmm1 the
       high 8; the non-contributing bytes are zeroed, so add merges them. */
    xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
    xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
    xmm3 = _mm_add_epi16(xmm0, xmm1);

    _mm_store_si128(p_target, xmm3);
    p_target += 1;
  }

  for (i = 0; i < intermediate; ++i) {
    xmm0 = _mm_load_si128(p_src0);
    p_src0 += 1;

    /* Only xmm0's half of the hsub result is ever stored; xmm1 just pads the
       second operand (see zero-initialization above). */
    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
    xmm2 = _mm_cmpgt_epi16(_mm_setzero_si128(), xmm3);

    xmm8 = _mm_and_si128(xmm2, xmm6);
    xmm3 = _mm_add_epi8(xmm8, xmm4);
    xmm0 = _mm_shuffle_epi8(xmm0, xmm3);

    /* Store only the low 64 bits (4 results).  Integer-domain store replaces
       the original GCC-specific (__m128d) cast through _mm_storel_pd. */
    _mm_storel_epi64(p_target, xmm0);
    p_target = (__m128i*)((int8_t*)p_target + 8);
  }

  /* Scalar tail for the final < 8 elements. */
  for (i = (bound << 4) + (intermediate << 3);
       i < (bound << 4) + (intermediate << 3) + leftovers;
       i += 2) {
    target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
  }
}

#endif /*LV_HAVE_SSSE3*/

#ifdef LV_HAVE_GENERIC

/*!
  \brief Portable version: selects the winner of each adjacent input pair.

  target[k] = ((int16_t)(src0[2k] - src0[2k+1]) > 0) ? src0[2k] : src0[2k+1]
  (comparison on the wrapped 16-bit difference — max* semantics).

  \param target    output buffer; receives num_bytes/4 int16 results
  \param src0      input buffer of num_bytes/2 int16 values
  \param num_bytes number of BYTES in src0 (not element count)
*/
static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {

  int i = 0;
  int bound = num_bytes >> 1;  /* number of int16 elements in src0 */

  for (i = 0; i < bound; i += 2) {
    target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
  }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/