Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_16u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_volk_16u_byteswap_u_H
54 #define INCLUDED_volk_16u_byteswap_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_GENERIC
60 
/*!
 * \brief Exchanges the two bytes of every 16-bit word in a buffer, in place
 *        (portable C implementation).
 *
 * \param intsToSwap Buffer of 16-bit words; each element is overwritten with
 *                   its byte-swapped value.
 * \param num_points Number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
                                             unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        const uint16_t word = intsToSwap[idx];
        /* High byte moves down, low byte moves up. */
        intsToSwap[idx] = (uint16_t)(((word >> 8) & 0xff) | ((word << 8) & 0xff00));
    }
}
72 #endif /* LV_HAVE_GENERIC */
73 
74 
75 #if LV_HAVE_AVX2
76 #include <immintrin.h>
77 static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
78 {
79  unsigned int number;
80 
81  const unsigned int nPerSet = 16;
82  const uint64_t nSets = num_points / nPerSet;
83 
84  uint16_t* inputPtr = (uint16_t*)intsToSwap;
85 
86  const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
87  10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
88  23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
89 
90  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
91 
92  for (number = 0; number < nSets; number++) {
93  // Load the 32t values, increment inputPtr later since we're doing it in-place.
94  const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
95  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
96 
97  // Store the results
98  _mm256_store_si256((__m256i*)inputPtr, output);
99  inputPtr += nPerSet;
100  }
101 
102  // Byteswap any remaining points:
103  for (number = nPerSet * nSets; number < num_points; number++) {
104  uint16_t outputVal = *inputPtr;
105  outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
106  *inputPtr = outputVal;
107  inputPtr++;
108  }
109 }
110 #endif /* LV_HAVE_AVX2 */
111 
112 
113 #if LV_HAVE_AVX2
114 #include <immintrin.h>
115 static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
116 {
117  unsigned int number;
118 
119  const unsigned int nPerSet = 16;
120  const uint64_t nSets = num_points / nPerSet;
121 
122  uint16_t* inputPtr = (uint16_t*)intsToSwap;
123 
124  const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
125  10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
126  23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
127 
128  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
129 
130  for (number = 0; number < nSets; number++) {
131  // Load the 32t values, increment inputPtr later since we're doing it in-place.
132  const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
133  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
134 
135  // Store the results
136  _mm256_storeu_si256((__m256i*)inputPtr, output);
137  inputPtr += nPerSet;
138  }
139 
140  // Byteswap any remaining points:
141  for (number = nPerSet * nSets; number < num_points; number++) {
142  uint16_t outputVal = *inputPtr;
143  outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
144  *inputPtr = outputVal;
145  inputPtr++;
146  }
147 }
148 #endif /* LV_HAVE_AVX2 */
149 
150 
151 #ifdef LV_HAVE_SSE2
152 #include <emmintrin.h>
153 
/*!
 * \brief Exchanges the two bytes of every 16-bit word in a buffer, in place,
 *        using SSE2 with unaligned 128-bit loads and stores.
 *
 * \param intsToSwap Buffer of 16-bit words, overwritten with the swapped
 *                   values.
 * \param num_points Number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int wordsPerVector = 8;
    const unsigned int vectorCount = num_points / wordsPerVector;
    uint16_t* cursor = intsToSwap;

    // Vector part: eight words per pass, written back to where they were read.
    for (unsigned int vec = 0; vec < vectorCount; ++vec) {
        const __m128i words = _mm_loadu_si128((__m128i*)cursor);
        // Shifting every 16-bit lane by 8 in both directions and OR-ing the
        // two results exchanges the bytes of each lane.
        const __m128i swapped =
            _mm_or_si128(_mm_slli_epi16(words, 8), _mm_srli_epi16(words, 8));
        _mm_storeu_si128((__m128i*)cursor, swapped);
        cursor += wordsPerVector;
    }

    // Scalar tail for the words that do not fill a whole vector.
    for (unsigned int left = num_points - vectorCount * wordsPerVector; left > 0; --left) {
        const uint16_t word = *cursor;
        *cursor = (uint16_t)(((word >> 8) & 0xff) | ((word << 8) & 0xff00));
        ++cursor;
    }
}
183 #endif /* LV_HAVE_SSE2 */
184 
185 
186 #endif /* INCLUDED_volk_16u_byteswap_u_H */
187 #ifndef INCLUDED_volk_16u_byteswap_a_H
188 #define INCLUDED_volk_16u_byteswap_a_H
189 
190 #include <inttypes.h>
191 #include <stdio.h>
192 
193 #ifdef LV_HAVE_SSE2
194 #include <emmintrin.h>
195 
/*!
 * \brief Exchanges the two bytes of every 16-bit word in a buffer, in place,
 *        using SSE2 with aligned 128-bit loads and stores.
 *
 * \param intsToSwap Buffer of 16-bit words, overwritten with the swapped
 *                   values. It is accessed with _mm_load_si128 /
 *                   _mm_store_si128, so it must satisfy their alignment
 *                   requirement.
 * \param num_points Number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int wordsPerVector = 8;
    const unsigned int vectorCount = num_points / wordsPerVector;
    uint16_t* cursor = intsToSwap;

    unsigned int vec = 0;
    while (vec < vectorCount) {
        __m128i* const slot = (__m128i*)cursor;
        const __m128i words = _mm_load_si128(slot);
        // Shifting every 16-bit lane by 8 in both directions and OR-ing the
        // two results exchanges the bytes of each lane.
        const __m128i movedUp = _mm_slli_epi16(words, 8);
        const __m128i movedDown = _mm_srli_epi16(words, 8);
        _mm_store_si128(slot, _mm_or_si128(movedUp, movedDown));
        cursor += wordsPerVector;
        ++vec;
    }

    // The words that do not fill a whole vector go through the generic kernel.
    volk_16u_byteswap_generic(cursor, num_points - vectorCount * wordsPerVector);
}
218 #endif /* LV_HAVE_SSE2 */
219 
220 #ifdef LV_HAVE_NEON
221 #include <arm_neon.h>
222 
/*!
 * \brief Exchanges the two bytes of every 16-bit word in a buffer, in place,
 *        using NEON.
 *
 * \param intsToSwap Buffer of 16-bit words, overwritten with the swapped
 *                   values.
 * \param num_points Number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
{
    unsigned int number;
    const unsigned int eighth_points = num_points / 8;
    uint16_t* inputPtr = intsToSwap;

    for (number = 0; number < eighth_points; number++) {
        const uint16x8_t input = vld1q_u16(inputPtr);
        // Reverse the byte order inside each 16-bit element. The result is
        // built from `input` alone, so no uninitialized register is read
        // (the previous shift-and-insert sequence used an unset destination
        // vector as its first operand).
        const uint16x8_t output =
            vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(input)));
        vst1q_u16(inputPtr, output);
        inputPtr += 8;
    }

    // The words that do not fill a whole vector go through the generic kernel.
    volk_16u_byteswap_generic(inputPtr, num_points - eighth_points * 8);
}
240 #endif /* LV_HAVE_NEON */
241 
242 #ifdef LV_HAVE_NEON
243 #include <arm_neon.h>
244 
/*!
 * \brief Exchanges the two bytes of every 16-bit word in a buffer, in place,
 *        using NEON table lookups on blocks of 16 words.
 *
 * \param intsToSwap Buffer of 16-bit words, overwritten with the swapped
 *                   values.
 * \param num_points Number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
                                                unsigned int num_points)
{
    const unsigned int wordsPerBlock = 16;
    const unsigned int blockCount = num_points / wordsPerBlock;
    uint16_t* cursor = intsToSwap;

    /* Pre-computed byte indices into the 32-byte table built by vld4_u8.
       vld4_u8 de-interleaves its input into four 8-byte rows (row r holds
       source bytes r, r+4, r+8, ...), so table index 8*r + c addresses
       source byte 4*c + r. Read least-significant byte first, the first
       constant is {8, 0, 24, 16, 9, 1, 25, 17}, i.e. source bytes
       1, 0, 3, 2, 5, 4, 7, 6. Each following constant adds 2 to every
       index, selecting the next eight source bytes. A constant can be
       rebuilt from its index list with:
           for (ii = 0; ii < 8; ++ii)
               value += ((uint64_t)chars[ii]) << (ii * 8);
    */
    const uint8x8_t lookup01 = vcreate_u8(1232017111498883080);
    const uint8x8_t lookup23 = vcreate_u8(1376697457175036426);
    const uint8x8_t lookup45 = vcreate_u8(1521377802851189772);
    const uint8x8_t lookup67 = vcreate_u8(1666058148527343118);

    for (unsigned int block = 0; block < blockCount; ++block) {
        uint8_t* const bytes = (uint8_t*)cursor;

        /* Read the whole 32-byte block and do all four lookups before
           anything is written back. */
        const uint8x8x4_t table = vld4_u8(bytes);
        const uint8x8_t swapped01 = vtbl4_u8(table, lookup01);
        const uint8x8_t swapped23 = vtbl4_u8(table, lookup23);
        const uint8x8_t swapped45 = vtbl4_u8(table, lookup45);
        const uint8x8_t swapped67 = vtbl4_u8(table, lookup67);

        /* Each result covers 8 bytes, i.e. four 16-bit words. */
        vst1_u8(bytes, swapped01);
        vst1_u8(bytes + 8, swapped23);
        vst1_u8(bytes + 16, swapped45);
        vst1_u8(bytes + 24, swapped67);

        cursor += wordsPerBlock;
    }

    /* The words that do not fill a whole block go through the generic kernel. */
    volk_16u_byteswap_generic(cursor, num_points - blockCount * wordsPerBlock);
}
285 #endif /* LV_HAVE_NEON */
286 
287 #ifdef LV_HAVE_GENERIC
288 
/*!
 * \brief Exchanges the two bytes of every 16-bit word in a buffer, in place
 *        (portable C implementation, "a" protokernel name).
 *
 * \param intsToSwap Buffer of 16-bit words; each element is overwritten with
 *                   its byte-swapped value.
 * \param num_points Number of 16-bit words to process.
 */
static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
                                               unsigned int num_points)
{
    uint16_t* cursor = intsToSwap;
    unsigned int remaining = num_points;

    while (remaining > 0) {
        const uint16_t word = *cursor;
        const uint16_t highToLow = (uint16_t)((word >> 8) & 0xff);
        const uint16_t lowToHigh = (uint16_t)((word << 8) & 0xff00);
        *cursor = (uint16_t)(highToLow | lowToHigh);
        ++cursor;
        --remaining;
    }
}
300 #endif /* LV_HAVE_GENERIC */
301 
302 #ifdef LV_HAVE_ORC
303 
/* Byte-swap routine defined outside this header; only its prototype is
   visible here. */
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap,
                                         unsigned int num_points);

/*!
 * \brief ORC protokernel entry point: forwards both arguments unchanged to
 *        volk_16u_byteswap_a_orc_impl.
 *
 * \param intsToSwap Buffer of 16-bit words handed to the ORC implementation.
 * \param num_points Number of 16-bit words in the buffer.
 */
static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap,
                                           unsigned int num_points)
{
    volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
}
309 #endif /* LV_HAVE_ORC */
310 
311 
312 #endif /* INCLUDED_volk_16u_byteswap_a_H */
static void volk_16u_byteswap_u_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:154
static void volk_16u_byteswap_a_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:289
static void volk_16u_byteswap_neon(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:223
static void volk_16u_byteswap_a_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:196
static void volk_16u_byteswap_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:61
static void volk_16u_byteswap_neon_table(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:245