/* M4RI 1.0.1 — mzd.h: dense matrix data type and basic operations over GF(2). */
00001 00010 #ifndef M4RI_MZD 00011 #define M4RI_MZD 00012 00013 /******************************************************************* 00014 * 00015 * M4RI: Linear Algebra over GF(2) 00016 * 00017 * Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu> 00018 * Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk> 00019 * Copyright (C) 2011 Carlo Wood <carlo@alinoe.com> 00020 * 00021 * Distributed under the terms of the GNU General Public License (GPL) 00022 * version 2 or higher. 00023 * 00024 * This code is distributed in the hope that it will be useful, 00025 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00026 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00027 * General Public License for more details. 00028 * 00029 * The full text of the GPL is available at: 00030 * 00031 * http://www.gnu.org/licenses/ 00032 * 00033 ********************************************************************/ 00034 00035 #include "m4ri_config.h" 00036 00037 #include <math.h> 00038 #include <assert.h> 00039 #include <stdio.h> 00040 00041 #if __M4RI_HAVE_SSE2 00042 #include <emmintrin.h> 00043 #endif 00044 00045 #include "misc.h" 00046 #include "debug_dump.h" 00047 00048 #if __M4RI_HAVE_SSE2 00049 00056 #define __M4RI_SSE2_CUTOFF 10 00057 #endif 00058 00065 #define __M4RI_MAX_MZD_BLOCKSIZE (((size_t)1) << 27) 00066 00075 #define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L2_CACHE))) / 2, 2048) 00076 00077 typedef struct { 00078 size_t size; 00079 word* begin; 00080 word* end; 00081 } mzd_block_t; 00082 00089 typedef struct mzd_t { 00094 rci_t nrows; 00095 00100 rci_t ncols; 00101 00108 wi_t width; 00109 00117 wi_t rowstride; 00118 00126 wi_t offset_vector; 00127 00132 wi_t row_offset; 00133 00138 uint16_t offset; 00139 00153 uint8_t flags; 00154 00160 uint8_t blockrows_log; 00161 00162 #if 0 // Commented out in order to keep the size of mzd_t 64 bytes (one cache line). This could be added back if rows was ever removed. 
00163 00168 int blockrows_mask; 00169 #endif 00170 00175 word high_bitmask; 00176 00181 word low_bitmask; 00182 00188 mzd_block_t *blocks; 00189 00195 word **rows; 00196 00197 } mzd_t; 00198 00202 static wi_t const mzd_paddingwidth = 3; 00203 00204 static uint8_t const mzd_flag_nonzero_offset = 0x1; 00205 static uint8_t const mzd_flag_nonzero_excess = 0x2; 00206 static uint8_t const mzd_flag_windowed_zerooffset = 0x4; 00207 static uint8_t const mzd_flag_windowed_zeroexcess = 0x8; 00208 static uint8_t const mzd_flag_windowed_ownsblocks = 0x10; 00209 static uint8_t const mzd_flag_multiple_blocks = 0x20; 00210 00218 static inline int mzd_is_windowed(mzd_t const *M) { 00219 return M->flags & (mzd_flag_nonzero_offset | mzd_flag_windowed_zerooffset); 00220 } 00221 00229 static inline int mzd_owns_blocks(mzd_t const *M) { 00230 return M->blocks && (!mzd_is_windowed(M) || ((M->flags & mzd_flag_windowed_ownsblocks))); 00231 } 00232 00241 static inline word* mzd_first_row(mzd_t const *M) { 00242 word* result = M->blocks[0].begin + M->offset_vector; 00243 assert(M->nrows == 0 || result == M->rows[0]); 00244 return result; 00245 } 00246 00257 static inline word* mzd_first_row_next_block(mzd_t const* M, int n) { 00258 assert(n > 0); 00259 return M->blocks[n].begin + M->offset_vector - M->row_offset * M->rowstride; 00260 } 00261 00271 static inline int mzd_row_to_block(mzd_t const* M, rci_t row) { 00272 return (M->row_offset + row) >> M->blockrows_log; 00273 } 00274 00288 static inline wi_t mzd_rows_in_block(mzd_t const* M, int n) { 00289 if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) { 00290 if (__M4RI_UNLIKELY(n == 0)) { 00291 return (1 << M->blockrows_log) - M->row_offset; 00292 } else { 00293 int const last_block = mzd_row_to_block(M, M->nrows - 1); 00294 if (n < last_block) 00295 return (1 << M->blockrows_log); 00296 return M->nrows + M->row_offset - (n << M->blockrows_log); 00297 } 00298 } 00299 return n ? 
0 : M->nrows; 00300 } 00301 00311 static inline word* mzd_row(mzd_t const* M, rci_t row) { 00312 wi_t big_vector = M->offset_vector + row * M->rowstride; 00313 word* result = M->blocks[0].begin + big_vector; 00314 if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) { 00315 int const n = (M->row_offset + row) >> M->blockrows_log; 00316 result = M->blocks[n].begin + big_vector - n * (M->blocks[0].size / sizeof(word)); 00317 } 00318 assert(result == M->rows[row]); 00319 return result; 00320 } 00321 00332 mzd_t *mzd_init(rci_t const r, rci_t const c); 00333 00340 void mzd_free(mzd_t *A); 00341 00342 00364 mzd_t *mzd_init_window(mzd_t *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc); 00365 00372 static inline mzd_t const *mzd_init_window_const(mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc) 00373 { 00374 return mzd_init_window((mzd_t*)M, lowr, lowc, highr, highc); 00375 } 00376 00383 #define mzd_free_window mzd_free 00384 00394 static inline void _mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb, wi_t const startblock) { 00395 if ((rowa == rowb) || (startblock >= M->width)) 00396 return; 00397 00398 /* This is the case since we're only called from _mzd_ple_mmpf, 00399 * which makes the same assumption. Therefore we don't need 00400 * to take a mask_begin into account. 
*/ 00401 assert(M->offset == 0); 00402 00403 wi_t width = M->width - startblock - 1; 00404 word *a = M->rows[rowa] + startblock; 00405 word *b = M->rows[rowb] + startblock; 00406 word tmp; 00407 word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix); 00408 00409 if (width != 0) { 00410 for(wi_t i = 0; i < width; ++i) { 00411 tmp = a[i]; 00412 a[i] = b[i]; 00413 b[i] = tmp; 00414 } 00415 } 00416 tmp = (a[width] ^ b[width]) & mask_end; 00417 a[width] ^= tmp; 00418 b[width] ^= tmp; 00419 00420 __M4RI_DD_ROW(M, rowa); 00421 __M4RI_DD_ROW(M, rowb); 00422 } 00423 00432 static inline void mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb) { 00433 if(rowa == rowb) 00434 return; 00435 00436 wi_t width = M->width - 1; 00437 word *a = M->rows[rowa]; 00438 word *b = M->rows[rowb]; 00439 word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - M->offset); 00440 word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix); 00441 00442 word tmp = (a[0] ^ b[0]) & mask_begin; 00443 if (width != 0) { 00444 a[0] ^= tmp; 00445 b[0] ^= tmp; 00446 00447 for(wi_t i = 1; i < width; ++i) { 00448 tmp = a[i]; 00449 a[i] = b[i]; 00450 b[i] = tmp; 00451 } 00452 tmp = (a[width] ^ b[width]) & mask_end; 00453 a[width] ^= tmp; 00454 b[width] ^= tmp; 00455 00456 } else { 00457 tmp &= mask_end; 00458 a[0] ^= tmp; 00459 b[0] ^= tmp; 00460 } 00461 00462 __M4RI_DD_ROW(M, rowa); 00463 __M4RI_DD_ROW(M, rowb); 00464 } 00465 00478 void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j); 00479 00488 void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb); 00489 00500 static inline void mzd_col_swap_in_rows(mzd_t *M, rci_t const cola, rci_t const colb, rci_t const start_row, rci_t const stop_row) { 00501 if (cola == colb) 00502 return; 00503 00504 rci_t const _cola = cola + M->offset; 00505 rci_t const _colb = colb + M->offset; 00506 00507 wi_t const a_word = _cola / m4ri_radix; 00508 wi_t const b_word = _colb / m4ri_radix; 00509 00510 int 
const a_bit = _cola % m4ri_radix; 00511 int const b_bit = _colb % m4ri_radix; 00512 00513 word* RESTRICT ptr = mzd_row(M, start_row); 00514 int max_bit = MAX(a_bit, b_bit); 00515 int count_remaining = stop_row - start_row; 00516 int min_bit = a_bit + b_bit - max_bit; 00517 int block = mzd_row_to_block(M, start_row); 00518 int offset = max_bit - min_bit; 00519 word mask = m4ri_one << min_bit; 00520 int count = MIN(mzd_rows_in_block(M, 0), count_remaining); 00521 // Apparently we're calling with start_row == stop_row sometimes (seems a bug to me). 00522 if (count <= 0) 00523 return; 00524 00525 if (a_word == b_word) { 00526 while(1) { 00527 count_remaining -= count; 00528 ptr += a_word; 00529 int fast_count = count / 4; 00530 int rest_count = count - 4 * fast_count; 00531 word xor_v[4]; 00532 wi_t const rowstride = M->rowstride; 00533 while (fast_count--) { 00534 xor_v[0] = ptr[0]; 00535 xor_v[1] = ptr[rowstride]; 00536 xor_v[2] = ptr[2 * rowstride]; 00537 xor_v[3] = ptr[3 * rowstride]; 00538 xor_v[0] ^= xor_v[0] >> offset; 00539 xor_v[1] ^= xor_v[1] >> offset; 00540 xor_v[2] ^= xor_v[2] >> offset; 00541 xor_v[3] ^= xor_v[3] >> offset; 00542 xor_v[0] &= mask; 00543 xor_v[1] &= mask; 00544 xor_v[2] &= mask; 00545 xor_v[3] &= mask; 00546 xor_v[0] |= xor_v[0] << offset; 00547 xor_v[1] |= xor_v[1] << offset; 00548 xor_v[2] |= xor_v[2] << offset; 00549 xor_v[3] |= xor_v[3] << offset; 00550 ptr[0] ^= xor_v[0]; 00551 ptr[rowstride] ^= xor_v[1]; 00552 ptr[2 * rowstride] ^= xor_v[2]; 00553 ptr[3 * rowstride] ^= xor_v[3]; 00554 ptr += 4 * rowstride; 00555 } 00556 while (rest_count--) { 00557 word xor_v = *ptr; 00558 xor_v ^= xor_v >> offset; 00559 xor_v &= mask; 00560 *ptr ^= xor_v | (xor_v << offset); 00561 ptr += rowstride; 00562 } 00563 if ((count = MIN(mzd_rows_in_block(M, ++block), count_remaining)) <= 0) 00564 break; 00565 ptr = mzd_first_row_next_block(M, block); 00566 } 00567 } else { 00568 word* RESTRICT min_ptr; 00569 wi_t max_offset; 00570 if (min_bit == a_bit) { 
00571 min_ptr = ptr + a_word; 00572 max_offset = b_word - a_word; 00573 } else { 00574 min_ptr = ptr + b_word; 00575 max_offset = a_word - b_word; 00576 } 00577 while(1) { 00578 count_remaining -= count; 00579 wi_t const rowstride = M->rowstride; 00580 while(count--) { 00581 word xor_v = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask; 00582 min_ptr[0] ^= xor_v; 00583 min_ptr[max_offset] ^= xor_v << offset; 00584 min_ptr += rowstride; 00585 } 00586 if ((count = MIN(mzd_rows_in_block(M, ++block), count_remaining)) <= 0) 00587 break; 00588 ptr = mzd_first_row_next_block(M, block); 00589 if (min_bit == a_bit) 00590 min_ptr = ptr + a_word; 00591 else 00592 min_ptr = ptr + b_word; 00593 } 00594 } 00595 00596 __M4RI_DD_MZD(M); 00597 } 00598 00610 static inline BIT mzd_read_bit(mzd_t const *M, rci_t const row, rci_t const col ) { 00611 return __M4RI_GET_BIT(M->rows[row][(col+M->offset)/m4ri_radix], (col+M->offset) % m4ri_radix); 00612 } 00613 00626 static inline void mzd_write_bit(mzd_t *M, rci_t const row, rci_t const col, BIT const value) { 00627 __M4RI_WRITE_BIT(M->rows[row][(col + M->offset) / m4ri_radix], (col + M->offset) % m4ri_radix, value); 00628 } 00629 00630 00641 static inline void mzd_xor_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) { 00642 int const spot = (y + M->offset) % m4ri_radix; 00643 wi_t const block = (y + M->offset) / m4ri_radix; 00644 M->rows[x][block] ^= values << spot; 00645 int const space = m4ri_radix - spot; 00646 if (n > space) 00647 M->rows[x][block + 1] ^= values >> space; 00648 } 00649 00660 static inline void mzd_and_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) { 00661 /* This is the best way, since this will drop out once we inverse the bits in values: */ 00662 values >>= (m4ri_radix - n); /* Move the bits to the lowest columns */ 00663 00664 int const spot = (y + M->offset) % m4ri_radix; 00665 wi_t const block = (y + M->offset) / m4ri_radix; 00666 M->rows[x][block] 
&= values << spot; 00667 int const space = m4ri_radix - spot; 00668 if (n > space) 00669 M->rows[x][block + 1] &= values >> space; 00670 } 00671 00681 static inline void mzd_clear_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) { 00682 assert(n>0 && n <= m4ri_radix); 00683 word values = m4ri_ffff >> (m4ri_radix - n); 00684 int const spot = (y + M->offset) % m4ri_radix; 00685 wi_t const block = (y + M->offset) / m4ri_radix; 00686 M->rows[x][block] &= ~(values << spot); 00687 int const space = m4ri_radix - spot; 00688 if (n > space) 00689 M->rows[x][block + 1] &= ~(values >> space); 00690 } 00691 00704 static inline void mzd_row_add_offset(mzd_t *M, rci_t dstrow, rci_t srcrow, rci_t coloffset) { 00705 assert(dstrow < M->nrows && srcrow < M->nrows && coloffset < M->ncols); 00706 coloffset += M->offset; 00707 wi_t const startblock= coloffset/m4ri_radix; 00708 wi_t wide = M->width - startblock; 00709 word *src = M->rows[srcrow] + startblock; 00710 word *dst = M->rows[dstrow] + startblock; 00711 word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset % m4ri_radix); 00712 word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix); 00713 00714 *dst++ ^= *src++ & mask_begin; 00715 --wide; 00716 00717 #if __M4RI_HAVE_SSE2 00718 int not_aligned = __M4RI_ALIGNMENT(src,16) != 0; /* 0: Aligned, 1: Not aligned */ 00719 if (wide > not_aligned + 1) /* Speed up for small matrices */ 00720 { 00721 if (not_aligned) { 00722 *dst++ ^= *src++; 00723 --wide; 00724 } 00725 /* Now wide > 1 */ 00726 __m128i* __src = (__m128i*)src; 00727 __m128i* __dst = (__m128i*)dst; 00728 __m128i* const eof = (__m128i*)((unsigned long)(src + wide) & ~0xFUL); 00729 do 00730 { 00731 __m128i xmm1 = _mm_xor_si128(*__dst, *__src); 00732 *__dst++ = xmm1; 00733 } 00734 while(++__src < eof); 00735 src = (word*)__src; 00736 dst = (word*)__dst; 00737 wide = ((sizeof(word)*wide)%16)/sizeof(word); 00738 } 00739 #endif 00740 wi_t i = -1; 00741 while(++i < wide) 00742 
dst[i] ^= src[i]; 00743 /* 00744 * Revert possibly non-zero excess bits. 00745 * Note that i == wide here, and wide can be 0. 00746 * But really, src[wide - 1] is M->rows[srcrow][M->width - 1] ;) 00747 * We use i - 1 here to let the compiler know these are the same addresses 00748 * that we last accessed, in the previous loop. 00749 */ 00750 dst[i - 1] ^= src[i - 1] & ~mask_end; 00751 00752 __M4RI_DD_ROW(M, dstrow); 00753 } 00754 00766 void mzd_row_add(mzd_t *M, rci_t const sourcerow, rci_t const destrow); 00767 00782 mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A); 00783 00798 mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B); 00799 00814 mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B); 00815 00827 mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, int const clear); 00828 00838 mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear); 00839 00850 void mzd_randomize(mzd_t *M); 00851 00866 void mzd_set_ui(mzd_t *M, unsigned int const value); 00867 00883 rci_t mzd_gauss_delayed(mzd_t *M, rci_t const startcol, int const full); 00884 00900 rci_t mzd_echelonize_naive(mzd_t *M, int const full); 00901 00911 int mzd_equal(mzd_t const *A, mzd_t const *B); 00912 00926 int mzd_cmp(mzd_t const *A, mzd_t const *B); 00927 00935 mzd_t *mzd_copy(mzd_t *DST, mzd_t const *A); 00936 00957 mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B); 00958 00978 mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B); 00979 00992 mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc); 00993 01007 mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I); 01008 01020 mzd_t *mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B); 01021 01032 mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B); 01033 01044 #define mzd_sub mzd_add 01045 01056 #define _mzd_sub _mzd_add 01057 01058 01059 01069 static inline word mzd_read_bits(mzd_t const *M, rci_t 
const x, rci_t const y, int const n) { 01070 int const spot = (y + M->offset) % m4ri_radix; 01071 wi_t const block = (y + M->offset) / m4ri_radix; 01072 int const spill = spot + n - m4ri_radix; 01073 word temp = (spill <= 0) ? M->rows[x][block] << -spill : (M->rows[x][block + 1] << (m4ri_radix - spill)) | (M->rows[x][block] >> spill); 01074 return temp >> (m4ri_radix - n); 01075 } 01076 01077 01097 void mzd_combine(mzd_t *DST, rci_t const row3, wi_t const startblock3, 01098 mzd_t const *SC1, rci_t const row1, wi_t const startblock1, 01099 mzd_t const *SC2, rci_t const row2, wi_t const startblock2); 01100 01101 01121 static inline void mzd_combine_weird(mzd_t *C, rci_t const c_row, wi_t const c_startblock, 01122 mzd_t const *A, rci_t const a_row, wi_t const a_startblock, 01123 mzd_t const *B, rci_t const b_row, wi_t const b_startblock) { 01124 word tmp; 01125 rci_t i = 0; 01126 01127 01128 for(; i + m4ri_radix <= A->ncols; i += m4ri_radix) { 01129 tmp = mzd_read_bits(A, a_row, i, m4ri_radix) ^ mzd_read_bits(B, b_row, i, m4ri_radix); 01130 mzd_clear_bits(C, c_row, i, m4ri_radix); 01131 mzd_xor_bits(C, c_row, i, m4ri_radix, tmp); 01132 } 01133 if(A->ncols - i) { 01134 tmp = mzd_read_bits(A, a_row, i, (A->ncols - i)) ^ mzd_read_bits(B, b_row, i, (B->ncols - i)); 01135 mzd_clear_bits(C, c_row, i, (C->ncols - i)); 01136 mzd_xor_bits(C, c_row, i, (C->ncols - i), tmp); 01137 } 01138 01139 __M4RI_DD_MZD(C); 01140 } 01141 01158 static inline void mzd_combine_even_in_place(mzd_t *A, rci_t const a_row, wi_t const a_startblock, 01159 mzd_t const *B, rci_t const b_row, wi_t const b_startblock) { 01160 01161 wi_t wide = A->width - a_startblock - 1; 01162 01163 word *a = A->rows[a_row] + a_startblock; 01164 word *b = B->rows[b_row] + b_startblock; 01165 01166 #if __M4RI_HAVE_SSE2 01167 if(wide > __M4RI_SSE2_CUTOFF) { 01169 if (__M4RI_ALIGNMENT(a,16)) { 01170 *a++ ^= *b++; 01171 wide--; 01172 } 01173 01174 if (__M4RI_ALIGNMENT(a, 16) == 0 && __M4RI_ALIGNMENT(b, 16) == 0) { 01175 
__m128i *a128 = (__m128i*)a; 01176 __m128i *b128 = (__m128i*)b; 01177 const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL); 01178 01179 do { 01180 *a128 = _mm_xor_si128(*a128, *b128); 01181 ++b128; 01182 ++a128; 01183 } while(a128 < eof); 01184 01185 a = (word*)a128; 01186 b = (word*)b128; 01187 wide = ((sizeof(word) * wide) % 16) / sizeof(word); 01188 } 01189 } 01190 #endif // __M4RI_HAVE_SSE2 01191 01192 if (wide > 0) { 01193 wi_t n = (wide + 7) / 8; 01194 switch (wide % 8) { 01195 case 0: do { *(a++) ^= *(b++); 01196 case 7: *(a++) ^= *(b++); 01197 case 6: *(a++) ^= *(b++); 01198 case 5: *(a++) ^= *(b++); 01199 case 4: *(a++) ^= *(b++); 01200 case 3: *(a++) ^= *(b++); 01201 case 2: *(a++) ^= *(b++); 01202 case 1: *(a++) ^= *(b++); 01203 } while (--n > 0); 01204 } 01205 } 01206 01207 *a ^= *b & __M4RI_LEFT_BITMASK(A->ncols%m4ri_radix); 01208 01209 __M4RI_DD_MZD(A); 01210 } 01211 01212 01232 static inline void mzd_combine_even(mzd_t *C, rci_t const c_row, wi_t const c_startblock, 01233 mzd_t const *A, rci_t const a_row, wi_t const a_startblock, 01234 mzd_t const *B, rci_t const b_row, wi_t const b_startblock) { 01235 01236 wi_t wide = A->width - a_startblock - 1; 01237 word *a = A->rows[a_row] + a_startblock; 01238 word *b = B->rows[b_row] + b_startblock; 01239 word *c = C->rows[c_row] + c_startblock; 01240 01241 /* /\* this is a corner case triggered by Strassen multiplication */ 01242 /* * which assumes certain (virtual) matrix sizes */ 01243 /* * 2011/03/07: I don't think this was ever correct *\/ */ 01244 /* if (a_row >= A->nrows) { */ 01245 /* assert(a_row < A->nrows); */ 01246 /* for(wi_t i = 0; i < wide; ++i) { */ 01247 /* c[i] = b[i]; */ 01248 /* } */ 01249 /* } else { */ 01250 #if __M4RI_HAVE_SSE2 01251 if(wide > __M4RI_SSE2_CUTOFF) { 01253 if (__M4RI_ALIGNMENT(a,16)) { 01254 *c++ = *b++ ^ *a++; 01255 wide--; 01256 } 01257 01258 if ( (__M4RI_ALIGNMENT(b, 16) | __M4RI_ALIGNMENT(c, 16)) == 0) { 01259 __m128i *a128 = (__m128i*)a; 01260 __m128i 
*b128 = (__m128i*)b; 01261 __m128i *c128 = (__m128i*)c; 01262 const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL); 01263 01264 do { 01265 *c128 = _mm_xor_si128(*a128, *b128); 01266 ++c128; 01267 ++b128; 01268 ++a128; 01269 } while(a128 < eof); 01270 01271 a = (word*)a128; 01272 b = (word*)b128; 01273 c = (word*)c128; 01274 wide = ((sizeof(word) * wide) % 16) / sizeof(word); 01275 } 01276 } 01277 #endif // __M4RI_HAVE_SSE2 01278 01279 if (wide > 0) { 01280 wi_t n = (wide + 7) / 8; 01281 switch (wide % 8) { 01282 case 0: do { *(c++) = *(a++) ^ *(b++); 01283 case 7: *(c++) = *(a++) ^ *(b++); 01284 case 6: *(c++) = *(a++) ^ *(b++); 01285 case 5: *(c++) = *(a++) ^ *(b++); 01286 case 4: *(c++) = *(a++) ^ *(b++); 01287 case 3: *(c++) = *(a++) ^ *(b++); 01288 case 2: *(c++) = *(a++) ^ *(b++); 01289 case 1: *(c++) = *(a++) ^ *(b++); 01290 } while (--n > 0); 01291 } 01292 } 01293 *c ^= ((*a ^ *b ^ *c) & __M4RI_LEFT_BITMASK(C->ncols%m4ri_radix)); 01294 01295 __M4RI_DD_MZD(C); 01296 } 01297 01298 01307 static inline int mzd_read_bits_int(mzd_t const *M, rci_t const x, rci_t const y, int const n) 01308 { 01309 return __M4RI_CONVERT_TO_INT(mzd_read_bits(M, x, y, n)); 01310 } 01311 01312 01319 int mzd_is_zero(mzd_t const *A); 01320 01329 void mzd_row_clear_offset(mzd_t *M, rci_t const row, rci_t const coloffset); 01330 01347 int mzd_find_pivot(mzd_t const *M, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c); 01348 01349 01362 double mzd_density(mzd_t const *A, wi_t res); 01363 01378 double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c); 01379 01380 01389 rci_t mzd_first_zero_row(mzd_t const *A); 01390 01397 static inline unsigned long long mzd_hash(mzd_t const *A) { 01398 unsigned long long hash = 0; 01399 for (rci_t r = 0; r < A->nrows; ++r) 01400 hash ^= rotate_word(calculate_hash(A->rows[r], A->width), r % m4ri_radix); 01401 return hash; 01402 } 01403 01413 mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A); 01414 01424 mzd_t *mzd_extract_l(mzd_t *L, 
mzd_t const *A); 01425 01426 #endif // M4RI_MZD