Skip to content

Commit 3571bac

Browse files
pauldreikPaul Dreik
authored andcommitted
Optionally disable checksumming (issue #118) (#207)
2 parents 33510fd + 69689da commit 3571bac

File tree

7 files changed

+61
-7
lines changed

7 files changed

+61
-7
lines changed

Makefile.am

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ TESTS=testcases/largefilesupport.sh \
2020
testcases/checksum_options.sh \
2121
testcases/md5collisions.sh \
2222
testcases/sha1collisions.sh \
23-
testcases/checksum_buffersize.sh
23+
testcases/checksum_buffersize.sh \
24+
testcases/verify_nochecksum.sh
25+
2426

2527
AUXFILES=testcases/common_funcs.sh \
2628
testcases/md5collisions/letter_of_rec.ps \

NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
next
2+
optionally disable the checksum step by giving -checksum none
3+
optionally show progress
14
1.7.0
25
requires a C++17 capable compiler.
36
new fast non-cryptographic hash xxh

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ Rdfind uses the following algorithm. If N is the number of files to search throu
8989
8. Remove files from list that have the same size but different first bytes.
9090
9. Sort on device and inode(speeds up file reading). Read a few bytes from the end of each file (last bytes).
9191
10. Remove files from list that have the same size but different last bytes.
92-
11. Sort on device and inode(speeds up file reading). Perform a checksum calculation for each file.
92+
11. Sort on device and inode(speeds up file reading). Perform a checksum calculation for each file (unless disabled with -checksum none).
9393
12. Only keep files on the list with the same size and checksum. These are duplicates.
9494
13. Sort list on size, priority number, and depth. The first file for every set of duplicates is considered to be the original.
9595
14. If flag "-makeresultsfile true", then print results file (default).

inofficial_cmake/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ set(testscripts
6666
testcases/verify_dryrun_option.sh
6767
testcases/verify_filesize_option.sh
6868
testcases/verify_maxfilesize_option.sh
69+
testcases/verify_nochecksum.sh
6970
testcases/verify_ranking.sh
7071
testcases/verify_size_savings.sh)
7172

rdfind.1

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,14 @@ Follow symlinks. Default is false.
7676
Removes items found which have identical inode and device ID. Default
7777
is true.
7878
.TP
79-
.BR \-checksum " " \fImd5\fR|\fIsha1\fR|\fIsha256\fR|\fIsha512|\fIxxh128\fR
79+
.BR \-checksum " " \fInone\fR|\fImd5\fR|\fIsha1\fR|\fIsha256\fR|\fIsha512\fR|\fIxxh128\fR
8080
What type of checksum to be used: md5, sha1, sha256, sha512 or xxh128. The default is
8181
sha1 since version 1.4.0. xxh128 is a very fast checksum, but not of cryptographic
8282
quality. xxh support is optional and requires that rdfind was configured with
8383
--with-xxhash. In case xxh is used but there is no support, an error is returned.
84+
Checksum none can be used to skip checksumming altogether. \fBThis is not recommended!\fR
85+
In case files of the same size have contents that differ it is likely they are falsely
86+
considered duplicates, leading to file removal (depending on other options).
8487
.TP
8588
.BR \-buffersize " " \fIN\fR
8689
Chunksize in bytes when calculating the checksum

rdfind.cc

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ usage()
6464
<< " -followsymlinks true |(false) follow symlinks\n"
6565
<< " -removeidentinode (true)| false ignore files with nonunique "
6666
"device and inode\n"
67-
<< " -checksum md5 |(sha1)| sha256 | sha512 | xxh128\n"
67+
<< " -checksum none | md5 |(sha1)| sha256 | sha512 | xxh128\n"
6868
<< indent << "checksum type\n"
6969
<< indent << "xxh128 is very fast, but is noncryptographic.\n"
7070
<< " -buffersize N\n"
@@ -116,6 +116,7 @@ struct Options
116116
bool usesha256 = false; // use sha256 checksum to check for similarity
117117
bool usesha512 = false; // use sha512 checksum to check for similarity
118118
bool usexxh128 = false; // use xxh128 checksum to check for similarity
119+
bool nochecksum = false; // skip using checksumming (unsafe!)
119120
bool deterministic = true; // be independent of filesystem order
120121
bool showprogress = false; // show progress while reading file contents
121122
std::size_t buffersize = 1 << 20; // chunksize to use when reading files
@@ -196,8 +197,12 @@ parseOptions(Parser& parser)
196197
"reconfigure and rebuild '--with-xxhash'\n";
197198
std::exit(EXIT_FAILURE);
198199
#endif
200+
} else if (parser.parsed_string_is("none")) {
201+
std::cout
202+
<< "DANGER! -checksum none given, will skip the checksumming stage\n";
203+
o.nochecksum = true;
199204
} else {
200-
std::cerr << "expected md5/sha1/sha256/sha512/xxh128, not \""
205+
std::cerr << "expected none/md5/sha1/sha256/sha512/xxh128, not \""
201206
<< parser.get_parsed_string() << "\"\n";
202207
std::exit(EXIT_FAILURE);
203208
}
@@ -273,8 +278,9 @@ parseOptions(Parser& parser)
273278

274279
// done with parsing of options. remaining arguments are files and dirs.
275280

276-
// decide what checksum to use - if no checksum is set, force sha1!
277-
if (!o.usemd5 && !o.usesha1 && !o.usesha256 && !o.usesha512 && !o.usexxh128) {
281+
// decide what checksum to use, default to sha1
282+
if (!o.usemd5 && !o.usesha1 && !o.usesha256 && !o.usesha512 && !o.usexxh128 &&
283+
!o.nochecksum) {
278284
o.usesha1 = true;
279285
}
280286
return o;

testcases/verify_nochecksum.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/sh
2+
# Ensures that skipping the checksumming stage works as intended.
3+
#
4+
5+
set -e
6+
. "$(dirname "$0")/common_funcs.sh"
7+
8+
# make a file which is longer than "first bytes" and "last bytes" together,
9+
# so we can make two files that differ only in the middle and will
10+
# need checksumming to see they are different.
11+
makefiles() {
12+
FIRSTBYTES=1000
13+
MIDDLEBYTES=1000
14+
LASTBYTES=1000
15+
for f in a b; do
16+
(
17+
head -c$FIRSTBYTES </dev/zero
18+
head -c$MIDDLEBYTES </dev/urandom
19+
head -c$LASTBYTES </dev/zero
20+
) >$f
21+
done
22+
}
23+
24+
reset_teststate
25+
makefiles
26+
27+
# with no checksum, we should falsely believe the files are equal
28+
$rdfind -checksum none a* b* \
29+
| grep "files that are not unique" >output.log
30+
31+
verify [ "$(cat output.log)" = "It seems like you have 2 files that are not unique" ]
32+
33+
# with checksumming (the default) the files should not be considered equal.
34+
$rdfind -checksum sha1 a* b* \
35+
| grep "files that are not unique" >output.log
36+
37+
verify [ "$(cat output.log)" = "It seems like you have 0 files that are not unique" ]
38+
39+
dbgecho "all is good for the checksum=none test!"

0 commit comments

Comments
 (0)