![]() |
NFFT
3.3.1
|
00001 /* 00002 * Copyright (c) 2002, 2016 Jens Keiner, Stefan Kunis, Daniel Potts 00003 * 00004 * This program is free software; you can redistribute it and/or modify it under 00005 * the terms of the GNU General Public License as published by the Free Software 00006 * Foundation; either version 2 of the License, or (at your option) any later 00007 * version. 00008 * 00009 * This program is distributed in the hope that it will be useful, but WITHOUT 00010 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 00011 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 00012 * details. 00013 * 00014 * You should have received a copy of the GNU General Public License along with 00015 * this program; if not, write to the Free Software Foundation, Inc., 51 00016 * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 00017 */ 00018 00019 00020 #ifdef _OPENMP 00021 #include <omp.h> 00022 #endif 00023 00024 #include "infft.h" 00025 00026 #define z_swap(_a_, _b_, _t_) do { (_t_) = (_a_); (_a_) = (_b_); (_b_) = (_t_); } while (0) 00027 00033 static inline void sort_node_indices_sort_bubble(INT n, INT *keys) 00034 { 00035 INT i, j, ti; 00036 00037 for (i = 0; i < n; ++i) 00038 { 00039 j = i; 00040 while (j > 0 && keys[2 * j + 0] < keys[2 * (j - 1) + 0]) 00041 { 00042 z_swap(keys[2 * j + 0], keys[2 * (j - 1) + 0], ti); 00043 z_swap(keys[2 * j + 1], keys[2 * (j - 1) + 1], ti); 00044 --j; 00045 } 00046 } 00047 } 00048 00054 static inline void sort_node_indices_radix_count(INT n, INT *keys, INT shift, INT mask, INT *counts) 00055 { 00056 INT i, k; 00057 00058 for (i = 0; i < n; ++i) 00059 { 00060 k = (keys[2 * i + 0] >> shift) & mask; 00061 ++counts[k]; 00062 } 00063 } 00064 00070 static inline void sort_node_indices_radix_rearrange(INT n, INT *keys_in, INT *keys_out, INT shift, INT mask, INT *displs) 00071 { 00072 INT i, k; 00073 00074 for (i = 0; i < n; ++i) 00075 { 00076 k = (keys_in[2 * i + 0] >> shift) & mask; 00077 keys_out[2 * displs[k] + 0] = keys_in[2 * i + 0]; 00078 keys_out[2 * displs[k] + 1] = keys_in[2 * i + 1]; 00079 ++displs[k]; 00080 } 00081 } 00082 00083 #define rwidth 9 00084 #define radix_n (1 << rwidth) 00085 00091 void Y(sort_node_indices_radix_lsdf)(INT n, INT *keys0, INT *keys1, INT rhigh) 00092 { 00093 const INT radix_mask = radix_n - 1; 00094 const INT rhigh_in = rhigh; 00095 00096 const INT tmax = 00097 #ifdef _OPENMP 00098 omp_get_max_threads(); 00099 #else 00100 1; 00101 #endif 00102 00103 INT *from, *to, *tmp; 00104 00105 INT i, k, l, h; 00106 INT *lcounts; 00107 00108 INT tid = 0, tnum = 1; 00109 00110 STACK_MALLOC(INT*, lcounts, (size_t)(tmax * radix_n) * sizeof(INT)); 00111 00112 from = keys0; 00113 to = keys1; 00114 00115 while (rhigh >= 0) 00116 { 00117 #ifdef _OPENMP 00118 #pragma omp parallel private(tid, tnum, i, l, h) 00119 { 00120 tid = omp_get_thread_num(); 00121 tnum = omp_get_num_threads(); 00122 #endif 00123 00124 for (i = 0; i < radix_n; ++i) lcounts[tid * radix_n + i] = 0; 00125 00126 l = (tid * n) / tnum; 00127 h = ((tid + 1) * n) / tnum; 00128 00129 sort_node_indices_radix_count(h - l, from + (2 * l), rhigh_in - rhigh, radix_mask, &lcounts[tid * radix_n]); 00130 #ifdef _OPENMP 00131 } 00132 #endif 00133 00134 k = 0; 00135 for (i = 0; i < radix_n; ++i) 00136 { 00137 for (l = 0; l < tmax; ++l) lcounts[l * radix_n + i] = (k += lcounts[l * radix_n + i]) - lcounts[l * radix_n + i]; 00138 } 00139 00140 #ifdef _OPENMP 00141 #pragma omp parallel private(tid, tnum, i, l, h) 00142 { 00143 tid = omp_get_thread_num(); 00144 tnum = omp_get_num_threads(); 00145 #endif 00146 00147 l = (tid * n) / tnum; 00148 h = ((tid + 1) * n) / tnum; 00149 00150 sort_node_indices_radix_rearrange(h - l, from + (2 * l), to, rhigh_in - rhigh, radix_mask, &lcounts[tid * radix_n]); 00151 #ifdef _OPENMP 00152 } 00153 #endif 00154 00155 /* print_keys(n, to);*/ 00156 00157 tmp = from; 00158 from = to; 00159 to = tmp; 00160 00161 rhigh -= rwidth; 00162 } 00163 00164 if (to == keys0) memcpy(to, from, (size_t)(n) * 2 * sizeof(INT)); 00165 00166 STACK_FREE(lcounts); 00167 } 00168 00174 void Y(sort_node_indices_radix_msdf)(INT n, INT *keys0, INT *keys1, INT rhigh) 00175 { 00176 const INT radix_mask = radix_n - 1; 00177 00178 const INT tmax = 00179 #ifdef _OPENMP 00180 omp_get_max_threads(); 00181 #else 00182 1; 00183 #endif 00184 00185 INT i, k, l, h; 00186 INT *lcounts; 00187 00188 INT counts[radix_n], displs[radix_n]; 00189 00190 INT tid = 0, tnum = 1; 00191 00192 STACK_MALLOC(INT*, lcounts, (size_t)(tmax * radix_n) * sizeof(INT)); 00193 00194 rhigh -= rwidth; 00195 00196 #ifdef _OPENMP 00197 #pragma omp parallel private(tid, tnum, i, l, h) 00198 { 00199 tid = omp_get_thread_num(); 00200 tnum = omp_get_num_threads(); 00201 #endif 00202 00203 for (i = 0; i < radix_n; ++i) lcounts[tid * radix_n + i] = 0; 00204 00205 l = (tid * n) / tnum; 00206 h = ((tid + 1) * n) / tnum; 00207 00208 sort_node_indices_radix_count(h - l, keys0 + (2 * l), rhigh + 1, radix_mask, &lcounts[tid * radix_n]); 00209 #ifdef _OPENMP 00210 } 00211 #endif 00212 00213 k = 0; 00214 for (i = 0; i < radix_n; ++i) 00215 { 00216 for (l = 0; l < tmax; ++l) lcounts[l * radix_n + i] = (k += lcounts[l * radix_n + i]) - lcounts[l * radix_n + i]; 00217 00218 displs[i] = lcounts[0 * radix_n + i]; 00219 if (i > 0) counts[i - 1] = displs[i] - displs[i - 1]; 00220 } 00221 counts[radix_n - 1] = n - displs[radix_n - 1]; 00222 00223 #ifdef _OPENMP 00224 #pragma omp parallel private(tid, tnum, i, l, h) 00225 { 00226 tid = omp_get_thread_num(); 00227 tnum = omp_get_num_threads(); 00228 #endif 00229 00230 l = (tid * n) / tnum; 00231 h = ((tid + 1) * n) / tnum; 00232 00233 sort_node_indices_radix_rearrange(h - l, keys0 + (2 * l), keys1, rhigh + 1, radix_mask, &lcounts[tid * radix_n]); 00234 #ifdef _OPENMP 00235 } 00236 #endif 00237 00238 memcpy(keys0, keys1, (size_t)(n) * 2 * sizeof(INT)); 00239 00240 if (rhigh >= 0) 00241 { 00242 for (i = 0; i < radix_n; ++i) 00243 { 00244 if (counts[i] > 1) 00245 { 00246 if (counts[i] > 256) 00247 Y(sort_node_indices_radix_msdf)(counts[i], keys0 + 2 * displs[i], keys1 + 2 * displs[i], rhigh); 00248 else 00249 sort_node_indices_sort_bubble(counts[i], keys0 + 2 * displs[i]); 00250 } 00251 } 00252 } 00253 00254 STACK_FREE(lcounts); 00255 }