16 #ifndef TLX_SORT_STRINGS_SAMPLE_SORT_TOOLS_HEADER
17 #define TLX_SORT_STRINGS_SAMPLE_SORT_TOOLS_HEADER
33 namespace sort_strings_detail {
38 template <
typename Type>
40 std::string
to_binary(Type v,
const size_t width = (8 *
sizeof(Type))) {
41 std::string str(width,
' ');
42 for (
size_t i = 0; i < width; i++) {
43 str[width - i - 1] = (v & 1) ?
'1' :
'0';
50 template <
size_t TreeBits>
51 struct PerfectTreeCalculations {
52 static const bool debug =
false;
54 static const size_t treebits = TreeBits;
63 int hi =
treebits - 32 + clz<uint32_t>(
id);
66 unsigned int bkt = ((
id << (hi + 1)) & bitmask) | (1 << hi);
78 int lo = ctz<uint32_t>(
id) + 1;
81 unsigned int bkt = ((
id >> lo) & bitmask) | (1 << (
treebits - lo));
121 template <
typename key_type,
size_t num_splitters>
130 key_type tree[num_splitters + 1],
131 unsigned char splitter_lcp[num_splitters + 1],
132 const key_type* samples,
size_t samplesize)
135 key_type sentinel = 0;
136 recurse(samples, samples + samplesize, 1, sentinel);
138 assert(
splitter_ == splitter + num_splitters);
139 assert(
lcp_iter_ == splitter_lcp + num_splitters);
141 splitter_lcp[0] &= 0x80;
143 splitter_lcp[num_splitters] = 0;
146 ptrdiff_t
snum(
const key_type* s)
const {
147 return static_cast<ptrdiff_t
>(s -
samples_);
150 key_type
recurse(
const key_type* lo,
const key_type* hi,
151 unsigned int treeidx, key_type& rec_prevkey) {
153 <<
"rec_buildtree(" <<
snum(lo) <<
"," <<
snum(hi)
154 <<
", treeidx=" << treeidx <<
")";
157 const key_type* mid = lo +
static_cast<ptrdiff_t
>(hi - lo) / 2;
160 <<
"tree[" << treeidx <<
"] = samples[" <<
snum(mid) <<
"] = "
163 key_type mykey =
tree_[treeidx] = *mid;
165 const key_type* midlo = mid;
166 while (lo < midlo && *(midlo - 1) == mykey) midlo--;
168 const key_type* midhi = mid;
169 while (midhi + 1 < hi && *midhi == mykey) midhi++;
171 if (midhi - midlo > 1)
174 const key_type* midlo = mid, * midhi = mid + 1;
176 if (2 * treeidx < num_splitters)
178 key_type prevkey =
recurse(lo, midlo, 2 * treeidx + 0, rec_prevkey);
180 key_type xorSplit = prevkey ^ mykey;
186 <<
" - " <<
clz(xorSplit)
187 <<
" bits = " <<
clz(xorSplit) / 8
193 (
clz(xorSplit) / 8) |
195 ((mykey & 0xFF) ? 0 : 0x80);
197 return recurse(midhi, hi, 2 * treeidx + 1, mykey);
201 key_type xorSplit = rec_prevkey ^ mykey;
207 <<
" - " <<
clz(xorSplit)
208 <<
" bits = " <<
clz(xorSplit) / 8
214 (
clz(xorSplit) / 8) |
216 ((mykey & 0xFF) ? 0 : 0x80);
231 template <
typename key_type,
size_t num_splitters>
232 class SSTreeBuilderLevelOrder
239 unsigned char splitter_lcp[num_splitters + 1],
240 const key_type* samples,
size_t samplesize)
244 key_type sentinel = 0;
245 recurse(samples, samples + samplesize, 1, sentinel);
247 assert(
lcp_iter_ == splitter_lcp + num_splitters);
249 splitter_lcp[0] &= 0x80;
251 splitter_lcp[num_splitters] = 0;
254 ptrdiff_t
snum(
const key_type* s)
const {
255 return static_cast<ptrdiff_t
>(s -
samples_);
258 key_type
recurse(
const key_type* lo,
const key_type* hi,
259 unsigned int treeidx, key_type& rec_prevkey) {
261 <<
"rec_buildtree(" <<
snum(lo) <<
"," <<
snum(hi)
262 <<
", treeidx=" << treeidx <<
")";
265 const key_type* mid = lo +
static_cast<ptrdiff_t
>(hi - lo) / 2;
268 <<
"tree[" << treeidx <<
"] = samples[" <<
snum(mid) <<
"] = "
271 key_type mykey =
tree_[treeidx] = *mid;
273 const key_type* midlo = mid;
274 while (lo < midlo && *(midlo - 1) == mykey) midlo--;
276 const key_type* midhi = mid;
277 while (midhi + 1 < hi && *midhi == mykey) midhi++;
279 if (midhi - midlo > 1)
282 const key_type* midlo = mid, * midhi = mid + 1;
284 if (2 * treeidx < num_splitters)
286 key_type prevkey =
recurse(lo, midlo, 2 * treeidx + 0, rec_prevkey);
288 key_type xorSplit = prevkey ^ mykey;
294 <<
" - " <<
clz(xorSplit)
295 <<
" bits = " <<
clz(xorSplit) / 8
299 (
clz(xorSplit) / 8) |
301 ((mykey & 0xFF) ? 0 : 0x80);
303 return recurse(midhi, hi, 2 * treeidx + 1, mykey);
307 key_type xorSplit = rec_prevkey ^ mykey;
313 <<
" - " <<
clz(xorSplit)
314 <<
" bits = " <<
clz(xorSplit) / 8
318 (
clz(xorSplit) / 8) |
320 ((mykey & 0xFF) ? 0 : 0x80);
335 template <
typename key_type,
size_t TreeBits,
size_t Rollout = 4>
336 class SSClassifyTreeUnrollInterleave
339 static const size_t treebits = TreeBits;
343 void build(key_type* samples,
size_t samplesize,
344 unsigned char* splitter_lcp) {
345 SSTreeBuilderPreAndLevelOrder<key_type, num_splitters>(
347 samples, samplesize);
351 unsigned int find_bkt(
const key_type& key)
const {
368 #define TLX_CLASSIFY_TREE_STEP \
369 for (size_t u = 0; u < Rollout; ++u) { \
370 i[u] = 2 * i[u] + (key[u] <= splitter_tree_[i[u]] ? 0 : 1); \
372 TLX_ATTRIBUTE_FALLTHROUGH;
377 const key_type key[Rollout], uint16_t obkt[Rollout])
const {
380 unsigned int i[Rollout];
381 std::fill(i, i + Rollout, 1u);
421 for (
size_t u = 0; u < Rollout; ++u)
424 for (
size_t u = 0; u < Rollout; ++u) {
429 for (
size_t u = 0; u < Rollout; ++u) {
436 #undef TLX_CLASSIFY_TREE_STEP
439 template <
typename StringSet>
442 const StringSet& strset,
443 typename StringSet::Iterator begin,
typename StringSet::Iterator end,
444 uint16_t* bktout,
size_t depth)
const {
447 if (begin + Rollout <= end)
449 key_type key[Rollout];
450 for (
size_t u = 0; u < Rollout; ++u)
451 key[u] = get_key<key_type>(strset, begin[u], depth);
455 begin += Rollout, bktout += Rollout;
459 key_type key = get_key<key_type>(strset, *begin++, depth);
477 template <
typename key_type,
size_t TreeBits>
478 class SSClassifyEqualUnroll
481 static const size_t treebits = TreeBits;
485 void build(key_type* samples,
size_t samplesize,
unsigned char* splitter_lcp) {
486 SSTreeBuilderLevelOrder<key_type, num_splitters>(
490 #define TLX_CLASSIFY_TREE_STEP \
491 if (TLX_UNLIKELY(key == splitter_tree_[i])) { \
493 2 * PerfectTreeCalculations<treebits>::level_to_preorder(i) - 1; \
495 i = 2 * i + (key < splitter_tree_[i] ? 0 : 1); \
496 TLX_ATTRIBUTE_FALLTHROUGH;
499 unsigned int find_bkt(
const key_type& key)
const {
546 #undef TLX_CLASSIFY_TREE_STEP
549 template <
typename StringSet>
551 const StringSet& strset,
552 typename StringSet::Iterator begin,
typename StringSet::Iterator end,
553 uint16_t* bktout,
size_t depth)
const {
556 key_type key = get_key<key_type>(strset, *begin++, depth);
575 template <
typename key_type,
size_t TreeBits,
unsigned Rollout = 4>
579 static const size_t treebits = TreeBits;
583 void build(key_type* samples,
size_t samplesize,
584 unsigned char* splitter_lcp) {
590 unsigned int find_bkt(
const key_type& key)
const {
613 #define TLX_CLASSIFY_TREE_STEP \
614 for (size_t u = 0; u < Rollout; ++u) { \
615 i[u] = 2 * i[u] + (key[u] <= splitter_tree_[i[u]] ? 0 : 1); \
617 TLX_ATTRIBUTE_FALLTHROUGH;
622 const key_type key[Rollout], uint16_t obkt[Rollout])
const {
625 unsigned int i[Rollout];
626 std::fill(i + 0, i + Rollout, 1);
667 for (
unsigned u = 0; u < Rollout; ++u)
670 for (
unsigned u = 0; u < Rollout; ++u) {
675 for (
unsigned u = 0; u < Rollout; ++u) {
682 #undef TLX_CLASSIFY_TREE_STEP
685 template <
typename StringSet>
688 const StringSet& strset,
689 typename StringSet::Iterator begin,
typename StringSet::Iterator end,
690 uint16_t* bktout,
size_t depth)
const {
693 if (begin + Rollout <= end)
695 key_type key[Rollout];
696 for (
size_t u = 0; u < Rollout; ++u)
697 key[u] = get_key<key_type>(strset, begin[u], depth);
701 begin += Rollout, bktout += Rollout;
705 key_type key = get_key<key_type>(strset, *begin++, depth);
726 #endif // !TLX_SORT_STRINGS_SAMPLE_SORT_TOOLS_HEADER