allow setting the first/last bytes reading size

Paul Dreik · Paul Dreik · commit 94e87ccb7e12 · 2026-02-15T20:49:03.000+01:00
diff --git a/Options.cc b/Options.cc
@@ -30,6 +30,12 @@ options are (default choice within parentheses)
 
  Processing options:
 
+ -firstbytessize N                sets the size in bytes when comparing the
+                                  beginning of files, prior to full checksumming.
+                                  default is 64 byte. Use 0 to disable the stage.
+ -lastbytessize N                 sets the size in bytes when comparing the
+                                  end of files, prior to full checksumming.
+                                  default is 64 byte. Use 0 to disable the stage.
  -checksum          none | md5 |(sha1)| sha256 | sha512 | xxh128
                                   checksum type
                                   xxh128 is very fast, but is noncryptographic.
@@ -128,6 +134,19 @@ parseOptions(Parser& parser)
       o.remove_identical_inode = parser.get_parsed_bool();
     } else if (parser.try_parse_bool("-deterministic")) {
       o.deterministic = parser.get_parsed_bool();
+    } else if (parser.try_parse_string("-firstbytessize")) {
+      const auto tmp = std::stoll(parser.get_parsed_string());
+      if (tmp < 0) {
+        throw std::runtime_error(
+          "negative value of firstbytessize not allowed");
+      }
+      o.first_bytes_size = tmp;
+    } else if (parser.try_parse_string("-lastbytessize")) {
+      const auto tmp = std::stoll(parser.get_parsed_string());
+      if (tmp < 0) {
+        throw std::runtime_error("negative value of lastbytessize not allowed");
+      }
+      o.last_bytes_size = tmp;
     } else if (parser.try_parse_string("-checksum")) {
       if (parser.parsed_string_is("md5")) {
         o.usemd5 = true;
diff --git a/README.md b/README.md
@@ -86,9 +86,9 @@ Rdfind uses the following algorithm. If N is the number of files to search throu
 5. If flag -removeidentinode true: Remove items from the list which already are added, based on the combination of inode and device number. A group of files that are hardlinked to the same file are collapsed to one entry. Also see the comment on hardlinks under ”caveats below”!
 6. Sort files on size. Remove files from the list, which have unique sizes.
 7. Sort on device and inode(speeds up file reading). Read a few bytes from the beginning of each file (first bytes).
-8. Remove files from list that have the same size but different first bytes.
+8. Remove files from list that have the same size but different first bytes. (This step can be disabled with -firstbytessize 0.)
 9. Sort on device and inode(speeds up file reading). Read a few bytes from the end of each file (last bytes).
-10. Remove files from list that have the same size but different last bytes.
+10. Remove files from list that have the same size but different last bytes. (This step can be disabled with -lastbytessize 0.)
 11. Sort on device and inode(speeds up file reading). Perform a checksum calculation for each file (unless disabled with -checksum none).
 12. Only keep files on the list with the same size and checksum. These are duplicates.
 13. Sort list on size, priority number, and depth. The first file for every set of duplicates is considered to be the original.
diff --git a/rdfind.1 b/rdfind.1
@@ -91,6 +91,14 @@ for files, smaller or bigger can improve performance
 dependent on filesystem and checksum algorithm.
 The default is 1 MiB, the maximum allowed is 128MiB (inclusive).
 .TP
+.BR \-firstbytessize " " \fIN\fR
+Number of bytes read from the beginning of each file when comparing first bytes, prior to full
+checksumming. The default is 64 bytes. Setting this to 0 skips the step entirely.
+.TP
+.BR \-lastbytessize " " \fIN\fR
+Number of bytes read from the end of each file when comparing last bytes, prior to full
+checksumming. The default is 64 bytes. Setting this to 0 skips the step entirely.
+.TP
 .BR \-deterministic " " \fItrue\fR|\fIfalse\fR
 If set (the default), sort files of equal rank in an unspecified but
 deterministic order. This makes the behaviour independent of in which
diff --git a/rdfind.cc b/rdfind.cc
@@ -146,9 +146,15 @@ main(int narg, const char* argv[])
   // candidates. start looking at the contents.
   std::vector<std::pair<Fileinfo::readtobuffermode, const char*>> modes{
     { Fileinfo::readtobuffermode::NOT_DEFINED, "" },
-    { Fileinfo::readtobuffermode::READ_FIRST_BYTES, "first bytes" },
-    { Fileinfo::readtobuffermode::READ_LAST_BYTES, "last bytes" },
   };
+  if (o.first_bytes_size > 0) {
+    modes.emplace_back(Fileinfo::readtobuffermode::READ_FIRST_BYTES,
+                       "first bytes");
+  }
+  if (o.last_bytes_size > 0) {
+    modes.emplace_back(Fileinfo::readtobuffermode::READ_LAST_BYTES,
+                       "last bytes");
+  }
   if (o.usemd5) {
     modes.emplace_back(Fileinfo::readtobuffermode::CREATE_MD5_CHECKSUM,
                        "md5 checksum");
diff --git a/testcases/verify_skipfirstbytes.sh b/testcases/verify_skipfirstbytes.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+# Ensures that the -firstbytessize and -lastbytessize options take effect.
+#
+
+set -e
+. "$(dirname "$0")/common_funcs.sh"
+
+FIRSTBYTES=1000
+MIDDLEBYTES=1000
+LASTBYTES=1000
+
+# Make two files (a and b), each longer than "first bytes" and "last bytes"
+# together, that differ only in the middle, so that checksumming is
+# needed to see that they are different.
+makefiles() {
+  for f in a b; do
+    (
+      head -c$FIRSTBYTES </dev/zero
+      head -c$MIDDLEBYTES </dev/urandom
+      head -c$LASTBYTES </dev/zero
+    ) >$f
+  done
+}
+
+reset_teststate
+makefiles
+
+defaultfirst="-firstbytessize 64"
+defaultlast="-lastbytessize 64"
+
+# with no checksum, we should falsely believe the files are equal
+# shellcheck disable=SC2086
+$rdfind -checksum none $defaultfirst $defaultlast a* b* \
+  | grep "files that are not unique" >output.log
+verify [ "$(cat output.log)" = "It seems like you have 2 files that are not unique" ]
+
+# if we set the first bytes size to be very large, we will detect it
+# shellcheck disable=SC2086
+$rdfind -checksum none -firstbytessize $((FIRSTBYTES + MIDDLEBYTES)) $defaultlast a* b* \
+  | grep "files that are not unique" >output.log
+verify [ "$(cat output.log)" = "It seems like you have 0 files that are not unique" ]
+
+# if we set the last bytes size to be very large, we will also detect it
+# shellcheck disable=SC2086
+$rdfind -checksum none $defaultfirst -lastbytessize $((MIDDLEBYTES + LASTBYTES)) a* b* \
+  | grep "files that are not unique" >output.log
+verify [ "$(cat output.log)" = "It seems like you have 0 files that are not unique" ]
+
+dbgecho "all is good for the skip first bytes step check!"