librsync  2.3.0
sumset.c
1 /*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*-
2  *
3  * librsync -- library for network deltas
4  *
5  * Copyright (C) 1999, 2000, 2001 by Martin Pool <mbp@sourcefrog.net>
6  * Copyright (C) 1999 by Andrew Tridgell
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation; either version 2.1 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  */
22 
23 #include "config.h"
24 #include <assert.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include "librsync.h"
28 #include "sumset.h"
29 #include "trace.h"
30 #include "util.h"
31 
32 static void rs_block_sig_init(rs_block_sig_t *sig, rs_weak_sum_t weak_sum,
33  rs_strong_sum_t *strong_sum, int strong_len)
34 {
35  sig->weak_sum = weak_sum;
36  if (strong_sum)
37  memcpy(sig->strong_sum, strong_sum, strong_len);
38 }
39 
40 static inline unsigned rs_block_sig_hash(const rs_block_sig_t *sig)
41 {
42  return (unsigned)sig->weak_sum;
43 }
44 
45 typedef struct rs_block_match {
46  rs_block_sig_t block_sig;
47  rs_signature_t *signature;
48  const void *buf;
49  size_t len;
51 
52 static void rs_block_match_init(rs_block_match_t *match, rs_signature_t *sig,
53  rs_weak_sum_t weak_sum,
54  rs_strong_sum_t *strong_sum, const void *buf,
55  size_t len)
56 {
57  rs_block_sig_init(&match->block_sig, weak_sum, strong_sum,
58  sig->strong_sum_len);
59  match->signature = sig;
60  match->buf = buf;
61  match->len = len;
62 }
63 
64 static inline int rs_block_match_cmp(rs_block_match_t *match,
65  const rs_block_sig_t *block_sig)
66 {
67  /* If buf is not NULL, the strong sum is yet to be calculated. */
68  if (match->buf) {
69 #ifndef HASHTABLE_NSTATS
70  match->signature->calc_strong_count++;
71 #endif
72  rs_signature_calc_strong_sum(match->signature, match->buf, match->len,
73  &(match->block_sig.strong_sum));
74  match->buf = NULL;
75  }
76  return memcmp(&match->block_sig.strong_sum, &block_sig->strong_sum,
77  match->signature->strong_sum_len);
78 }
79 
80 /* Disable mix32() in the hashtable because RabinKarp doesn't need it. We
81  manually apply mix32() to rollsums before using them in the hashtable. */
82 #define HASHTABLE_NMIX32
83 /* Instantiate hashtable for rs_block_sig and rs_block_match. */
84 #define ENTRY rs_block_sig
85 #define MATCH rs_block_match
86 #define NAME hashtable
87 #include "hashtable.h"
88 
89 /* Get the size of a packed rs_block_sig_t. */
90 static inline size_t rs_block_sig_size(const rs_signature_t *sig)
91 {
92  /* Round up to next multiple of sizeof(weak_sum) to align memory correctly.
93  */
94  return offsetof(rs_block_sig_t,
95  strong_sum) + ((sig->strong_sum_len +
96  sizeof(rs_weak_sum_t)-
97  1) / sizeof(rs_weak_sum_t)) *
98  sizeof(rs_weak_sum_t);
99 }
100 
101 /* Get the pointer to the block_sig_t from a block index. */
102 static inline rs_block_sig_t *rs_block_sig_ptr(const rs_signature_t *sig,
103  int block_idx)
104 {
105  return (rs_block_sig_t *)((char *)sig->block_sigs +
106  block_idx * rs_block_sig_size(sig));
107 }
108 
109 /* Get the index of a block from a block_sig_t pointer. */
110 static inline int rs_block_sig_idx(const rs_signature_t *sig,
111  rs_block_sig_t *block_sig)
112 {
113  return ((char *)block_sig -
114  (char *)sig->block_sigs) / rs_block_sig_size(sig);
115 }
116 
117 rs_result rs_sig_args(rs_long_t old_fsize, rs_magic_number * magic,
118  size_t *block_len, size_t *strong_len)
119 {
120  size_t rec_block_len; /* the recomended block_len for the given
121  old_fsize. */
122  size_t min_strong_len; /* the minimum strong_len for the given
123  old_fsize and block_len. */
124  size_t max_strong_len; /* the maximum strong_len for the given magic. */
125 
126  /* Check and set default arguments. */
127  *magic = *magic ? *magic : RS_RK_BLAKE2_SIG_MAGIC;
128  switch (*magic) {
129  case RS_BLAKE2_SIG_MAGIC:
131  max_strong_len = RS_BLAKE2_SUM_LENGTH;
132  break;
133  case RS_MD4_SIG_MAGIC:
134  case RS_RK_MD4_SIG_MAGIC:
135  max_strong_len = RS_MD4_SUM_LENGTH;
136  break;
137  default:
138  rs_error("invalid magic %#x", *magic);
139  return RS_BAD_MAGIC;
140  }
141  /* The recommended block_len is sqrt(old_fsize) with a 256 min size to give
142  a reasonable compromise between signature size, delta size, and
143  performance. If the old_fsize is unknown, we use the default. */
144  if (old_fsize < 0) {
145  rec_block_len = RS_DEFAULT_BLOCK_LEN;
146  } else {
147  rec_block_len = old_fsize <= 256 * 256 ? 256 : rs_long_sqrt(old_fsize);
148  }
149  if (*block_len == 0)
150  *block_len = rec_block_len;
151  /* The recommended strong_len assumes the worst case new_fsize = old_fsize
152  + 16MB with no matches. This results in comparing a block at every byte
153  offset against all the blocks in the signature, or new_fsize*block_num
154  comparisons. With N bits in the blocksig, there is a 1/2^N chance per
155  comparison of a hash colision. So with 2^N attempts there would be a
156  fair chance of having a collision. So we want to round up to the next
157  byte, add an extra 2 bytes (16 bits) in the strongsum, and assume the
158  weaksum is worth another 16 bits, for at least 32 bits extra, giving a
159  worst case 1/2^32 chance of having a hash collision per delta. If
160  old_fsize is unknown we use a conservative default. */
161  if (old_fsize < 0) {
162  min_strong_len = RS_DEFAULT_MIN_STRONG_LEN;
163  } else {
164  min_strong_len =
165  2 + (rs_long_ln2(old_fsize + ((rs_long_t)1 << 24)) +
166  rs_long_ln2(old_fsize / *block_len + 1) + 7) / 8;
167  }
168  if (*strong_len == 0)
169  *strong_len = max_strong_len;
170  else if (*strong_len == -1)
171  *strong_len = min_strong_len;
172  else if (old_fsize >= 0 && *strong_len < min_strong_len) {
173  rs_log(RS_LOG_WARNING,
174  "strong_len=" FMT_SIZE " smaller than recommended minimum "
175  FMT_SIZE " for old_fsize=" FMT_LONG " with block_len=" FMT_SIZE,
176  *strong_len, min_strong_len, old_fsize, *block_len);
177  } else if (*strong_len > max_strong_len) {
178  rs_error("invalid strong_len=" FMT_SIZE " for magic=%#x", *strong_len,
179  (int)*magic);
180  return RS_PARAM_ERROR;
181  }
182  rs_sig_args_check(*magic, *block_len, *strong_len);
183  return RS_DONE;
184 }
185 
186 rs_result rs_signature_init(rs_signature_t *sig, rs_magic_number magic,
187  size_t block_len, size_t strong_len,
188  rs_long_t sig_fsize)
189 {
190  rs_result result;
191 
192  /* Check and set default arguments, using old_fsize=-1 for unknown. */
193  if ((result = rs_sig_args(-1, &magic, &block_len, &strong_len)) != RS_DONE)
194  return result;
195  /* Set attributes from args. */
196  sig->magic = magic;
197  sig->block_len = block_len;
198  sig->strong_sum_len = strong_len;
199  sig->count = 0;
200  /* Calculate the number of blocks if we have the signature file size. */
201  /* Magic+header is 12 bytes, each block thereafter is 4 bytes
202  weak_sum+strong_sum_len bytes */
203  sig->size = (int)(sig_fsize < 12 ? 0 : (sig_fsize - 12) / (4 + strong_len));
204  if (sig->size)
205  sig->block_sigs =
206  rs_alloc(sig->size * rs_block_sig_size(sig),
207  "signature->block_sigs");
208  else
209  sig->block_sigs = NULL;
210  sig->hashtable = NULL;
211 #ifndef HASHTABLE_NSTATS
212  sig->calc_strong_count = 0;
213 #endif
214  rs_signature_check(sig);
215  return RS_DONE;
216 }
217 
218 void rs_signature_done(rs_signature_t *sig)
219 {
220  hashtable_free(sig->hashtable);
221  free(sig->block_sigs);
222  rs_bzero(sig, sizeof(*sig));
223 }
224 
225 rs_block_sig_t *rs_signature_add_block(rs_signature_t *sig,
226  rs_weak_sum_t weak_sum,
227  rs_strong_sum_t *strong_sum)
228 {
229  rs_signature_check(sig);
230  /* Apply mix32() to rollsum weaksums to improve their distribution. */
231  if (rs_signature_weaksum_kind(sig) == RS_ROLLSUM)
232  weak_sum = mix32(weak_sum);
233  /* If block_sigs is full, allocate more space. */
234  if (sig->count == sig->size) {
235  sig->size = sig->size ? sig->size * 2 : 16;
236  sig->block_sigs =
237  rs_realloc(sig->block_sigs, sig->size * rs_block_sig_size(sig),
238  "signature->block_sigs");
239  }
240  rs_block_sig_t *b = rs_block_sig_ptr(sig, sig->count++);
241  rs_block_sig_init(b, weak_sum, strong_sum, sig->strong_sum_len);
242  return b;
243 }
244 
245 rs_long_t rs_signature_find_match(rs_signature_t *sig, rs_weak_sum_t weak_sum,
246  void const *buf, size_t len)
247 {
249  rs_block_sig_t *b;
250 
251  rs_signature_check(sig);
252  rs_block_match_init(&m, sig, weak_sum, NULL, buf, len);
253  if ((b = hashtable_find(sig->hashtable, &m))) {
254  return (rs_long_t)rs_block_sig_idx(sig, b) * sig->block_len;
255  }
256  return -1;
257 }
258 
260 {
261 #ifndef HASHTABLE_NSTATS
262  hashtable_t *t = sig->hashtable;
263 
264  rs_log(RS_LOG_INFO | RS_LOG_NONAME,
265  "match statistics: signature[%ld searches, %ld (%.3f%%) matches, "
266  "%ld (%.3fx) weak sum compares, %ld (%.3f%%) strong sum compares, "
267  "%ld (%.3f%%) strong sum calcs]", t->find_count, t->match_count,
268  100.0 * (double)t->match_count / t->find_count, t->hashcmp_count,
269  (double)t->hashcmp_count / t->find_count, t->entrycmp_count,
270  100.0 * (double)t->entrycmp_count / t->find_count,
271  sig->calc_strong_count,
272  100.0 * (double)sig->calc_strong_count / t->find_count);
273 #endif
274 }
275 
277 {
279  rs_block_sig_t *b;
280  int i;
281 
282  rs_signature_check(sig);
283  sig->hashtable = hashtable_new(sig->count);
284  if (!sig->hashtable)
285  return RS_MEM_ERROR;
286  for (i = 0; i < sig->count; i++) {
287  b = rs_block_sig_ptr(sig, i);
288  rs_block_match_init(&m, sig, b->weak_sum, &b->strong_sum, NULL, 0);
289  if (!hashtable_find(sig->hashtable, &m))
290  hashtable_add(sig->hashtable, b);
291  }
292  hashtable_stats_init(sig->hashtable);
293  return RS_DONE;
294 }
295 
297 {
298  rs_signature_done(psums);
299  free(psums);
300 }
301 
303 {
304  int i;
305  rs_block_sig_t *b;
306  char strong_hex[RS_MAX_STRONG_SUM_LENGTH * 3];
307 
308  rs_log(RS_LOG_INFO | RS_LOG_NONAME,
309  "sumset info: magic=%#x, block_len=%d, block_num=%d", sums->magic,
310  sums->block_len, sums->count);
311 
312  for (i = 0; i < sums->count; i++) {
313  b = rs_block_sig_ptr(sums, i);
314  rs_hexify(strong_hex, b->strong_sum, sums->strong_sum_len);
315  rs_log(RS_LOG_INFO | RS_LOG_NONAME,
316  "sum %6d: weak=" FMT_WEAKSUM ", strong=%s", i, b->weak_sum,
317  strong_hex);
318  }
319 }
int size
Total number of blocks allocated.
Definition: sumset.h:42
hashtable_t * hashtable
The hashtable for finding matches.
Definition: sumset.h:44
logging functions.
int count
Total number of blocks.
Definition: sumset.h:41
long hashcmp_count
The count of hash compares done.
Definition: hashtable.h:135
int block_len
The block length.
Definition: sumset.h:39
A signature file using the BLAKE2 hash.
Definition: librsync.h:89
Bad value passed in to library, probably an application bug.
Definition: librsync.h:200
Warning conditions.
Definition: librsync.h:123
Out of memory.
Definition: librsync.h:189
long match_count
The count of matches found.
Definition: hashtable.h:134
Bad magic number at start of stream.
Definition: librsync.h:193
LIBRSYNC_EXPORT void rs_sumset_dump(rs_signature_t const *)
Dump signatures to the log.
Definition: sumset.c:302
static unsigned mix32(unsigned int h)
MurmurHash3 finalization mix function.
Definition: hashtable.h:147
rs_weak_sum_t weak_sum
Block's weak checksum.
Definition: sumset.h:29
int strong_sum_len
The block strong sum length.
Definition: sumset.h:40
A signature file with MD4 signatures.
Definition: librsync.h:82
long entrycmp_count
The count of entry compares done.
Definition: hashtable.h:136
void * block_sigs
The packed block_sigs for all blocks.
Definition: sumset.h:43
Public header for librsync.
Signature of a whole file.
Definition: sumset.h:37
LIBRSYNC_EXPORT void rs_free_sumset(rs_signature_t *)
Deep deallocation of checksums.
Definition: sumset.c:296
Don't show function name in message.
Definition: trace.h:82
Informational.
Definition: librsync.h:125
LIBRSYNC_EXPORT rs_result rs_build_hash_table(rs_signature_t *sums)
Call this after loading a signature to index it.
Definition: sumset.c:276
rs_result
Return codes from nonblocking rsync operations.
Definition: librsync.h:180
A signature file with RabinKarp rollsum and MD4 hash.
Definition: librsync.h:99
int magic
The signature magic value.
Definition: sumset.h:38
#define RS_DEFAULT_BLOCK_LEN
Default block length, if not determined by any other factors.
Definition: librsync.h:361
long calc_strong_count
The count of strongsum calcs done.
Definition: sumset.h:47
Signature of a single block.
Definition: sumset.h:28
rs_magic_number
A uint32 magic number, emitted in bigendian/network order at the start of librsync files.
Definition: librsync.h:65
#define RS_DEFAULT_MIN_STRONG_LEN
Default minimum strong sum length, if the filesize is unknown.
Definition: librsync.h:367
long find_count
The count of finds tried.
Definition: hashtable.h:133
LIBRSYNC_EXPORT void rs_signature_log_stats(rs_signature_t const *sig)
Log the rs_signature_delta match stats.
Definition: sumset.c:259
LIBRSYNC_EXPORT rs_result rs_sig_args(rs_long_t old_fsize, rs_magic_number *magic, size_t *block_len, size_t *strong_len)
Get or check signature arguments for a given file size.
Definition: sumset.c:117
The hashtable type.
Definition: hashtable.h:128
Completed successfully.
Definition: librsync.h:181
A generic open addressing hashtable.
rs_strong_sum_t strong_sum
Block's strong checksum.
Definition: sumset.h:30
A signature file with RabinKarp rollsum and BLAKE2 hash.
Definition: librsync.h:109
LIBRSYNC_EXPORT void rs_hexify(char *to_buf, void const *from_buf, int from_len)
Convert from_len bytes at from_buf into a hex representation in to_buf, which must be twice as long p...
Definition: hex.c:23