Skip to content

Commit cbc7285

Browse files
committed
DedupeTool: Print stats, add --dry-run and --quiet options
Dry-run mode calculates and prints deduplication statistics without writing output files. Quiet mode suppresses statistics output.
1 parent 19a7604 commit cbc7285

File tree

2 files changed

+126
-11
lines changed

2 files changed

+126
-11
lines changed

src/org/netpreserve/jwarc/tools/DedupeTool.java

Lines changed: 111 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@
1313
import java.net.URI;
1414
import java.net.URL;
1515
import java.net.URLEncoder;
16+
import java.nio.ByteBuffer;
1617
import java.nio.channels.FileChannel;
18+
import java.nio.channels.WritableByteChannel;
1719
import java.nio.file.Path;
1820
import java.nio.file.Paths;
1921
import java.time.Instant;
@@ -29,6 +31,8 @@ public class DedupeTool {
2931
private long minimumSize = 256;
3032
private String cdxServer;
3133
private boolean verbose;
34+
private boolean dryRun;
35+
private boolean quiet;
3236
private LruCache<WarcDigest, CacheValue> digestCache;
3337

3438
private static class LruCache<K, V> extends LinkedHashMap<K, V> {
@@ -57,14 +61,43 @@ private CacheValue(URI id, String targetUri, Instant date) {
5761
}
5862
}
5963

64+
/**
65+
* A WritableByteChannel that discards everything written.
66+
*/
67+
private static class NullWritableByteChannel implements WritableByteChannel {
68+
private boolean open = true;
69+
70+
@Override
71+
public int write(ByteBuffer src) {
72+
int remaining = src.remaining();
73+
src.position(src.limit()); // consume all bytes
74+
return remaining;
75+
}
76+
77+
@Override
78+
public boolean isOpen() {
79+
return open;
80+
}
81+
82+
@Override
83+
public void close() {
84+
open = false;
85+
}
86+
}
87+
6088
public void deduplicateWarcFile(Path infile, Path outfile) throws IOException {
89+
long totalRecords = 0;
90+
long deduplicatedRecords = 0;
91+
long totalSize = 0;
92+
long savedSize = 0;
93+
94+
// We create the WarcWriter on demand so that if no records are deduplicated we don't write an empty
95+
// gzip member at the end of the file.
96+
WarcWriter writer = null;
97+
6198
try (FileChannel input = FileChannel.open(infile);
6299
WarcReader reader = new WarcReader(input);
63-
FileChannel output = FileChannel.open(outfile, WRITE, CREATE, TRUNCATE_EXISTING)) {
64-
65-
// We create the WarcWriter on demand so that if no records are deduplicated we don't write an empty
66-
// gzip member at the end of the file.
67-
WarcWriter writer = null;
100+
FileChannel output = dryRun ? null : FileChannel.open(outfile, WRITE, CREATE, TRUNCATE_EXISTING)) {
68101

69102
WarcRecord record = reader.next().orElse(null);
70103
while (record != null) {
@@ -74,18 +107,59 @@ public void deduplicateWarcFile(Path infile, Path outfile) throws IOException {
74107
record = reader.next().orElse(null);
75108
long length = reader.position() - position;
76109

110+
totalRecords++;
111+
totalSize += length;
112+
77113
if (revisit == null) {
78-
if (verbose) System.out.println("Copying " + position + ":" + length);
79-
transferExactly(input, position, length, output);
114+
if (verbose) {
115+
// Both prefixes end with a space so the record offset is not glued to the verb.
System.out.println((dryRun ? "Would copy " : "Copying ") + position + ":" + length);
116+
}
117+
if (!dryRun) {
118+
transferExactly(input, position, length, output);
119+
}
80120
} else {
81-
if (verbose) System.out.println("Writing revisit for " + position + ":" + length);
82-
if (writer == null) writer = new WarcWriter(output, reader.compression());
121+
if (verbose) {
122+
System.out.println((dryRun ? "Would write" : "Writing") + " revisit for " + position + ":" + length);
123+
}
124+
deduplicatedRecords++;
125+
126+
if (writer == null) {
127+
if (dryRun) {
128+
writer = new WarcWriter(new NullWritableByteChannel(), reader.compression());
129+
} else {
130+
writer = new WarcWriter(output, reader.compression());
131+
}
132+
}
133+
long beforePosition = writer.position();
83134
writer.write(revisit);
135+
long revisitSize = writer.position() - beforePosition;
136+
137+
savedSize += (length - revisitSize);
84138
}
85139
}
140+
} finally {
141+
if (writer != null) writer.close();
142+
}
143+
144+
// Print statistics unless quiet mode is enabled
145+
if (!quiet) {
146+
double percentage = totalSize > 0 ? (double) savedSize / totalSize * 100 : 0.0;
147+
String action = dryRun ? "would dedupe" : "deduped";
148+
System.out.printf("%s: %s %d/%d records, saving %s/%s (%.2f%%)%n",
149+
outfile != null ? outfile.getFileName() : infile.getFileName(), action, deduplicatedRecords,
150+
totalRecords, formatBytes(savedSize), formatBytes(totalSize), percentage);
86151
}
87152
}
88153

154+
155+
156+
private String formatBytes(long bytes) {
157+
if (bytes < 1024) return bytes + "B";
158+
if (bytes < 1024 * 1024) return String.format("%.2fKB", bytes / 1024.0);
159+
if (bytes < 1024 * 1024 * 1024) return String.format("%.2fMB", bytes / (1024.0 * 1024.0));
160+
return String.format("%.2f GB", bytes / (1024.0 * 1024.0 * 1024.0));
161+
}
162+
89163
private static void transferExactly(FileChannel input, long position, long length, FileChannel output) throws IOException {
90164
long transferred = 0;
91165
while (transferred < length) {
@@ -198,11 +272,21 @@ public static void main(String[] args) throws IOException {
198272
System.out.println(" --cache-size N Cache N digests for de-duplication (enables cross-URI de-duplication)");
199273
System.out.println(" --cdx-server URL De-deduplicate against a remote CDX server");
200274
System.out.println(" --minimum-size BYTES Minimum payload size to consider de-duplicating (default " + dedupeTool.minimumSize + ")");
275+
System.out.println(" -n, --dry-run Don't write output, just calculate and print deduplication statistics");
276+
System.out.println(" -q, --quiet Don't print deduplication statistics");
201277
System.out.println(" -v, --verbose Verbose output");
202278
return;
203279
case "-v":
204280
case "--verbose":
205-
dedupeTool.verbose = true;
281+
dedupeTool.setVerbose(true);
282+
break;
283+
case "-n":
284+
case "--dry-run":
285+
dedupeTool.setDryRun(true);
286+
break;
287+
case "-q":
288+
case "--quiet":
289+
dedupeTool.setQuiet(true);
206290
break;
207291
default:
208292
System.err.println("Unrecognized option: " + args[i]);
@@ -216,7 +300,15 @@ public static void main(String[] args) throws IOException {
216300
}
217301

218302
for (Path infile : infiles) {
219-
dedupeTool.deduplicateWarcFile(infile, determineOutputPath(infile));
303+
try {
304+
Path outfile = dedupeTool.dryRun ? null : determineOutputPath(infile);
305+
dedupeTool.deduplicateWarcFile(infile, outfile);
306+
} catch (IOException e) {
307+
System.err.println("Failed to deduplicate " + infile + ": " + e.getMessage());
308+
if (!dedupeTool.quiet) e.printStackTrace(System.err);
309+
System.exit(1);
310+
return;
311+
}
220312
}
221313
}
222314

@@ -231,4 +323,12 @@ public void setMinimumSize(long minimumSize) {
231323
/**
 * Enables or disables verbose output.
 *
 * <p>When enabled, deduplicateWarcFile prints a line for each record it copies or
 * replaces with a revisit record.
 *
 * @param verbose {@code true} to print per-record messages
 */
public void setVerbose(boolean verbose) {
232324
this.verbose = verbose;
233325
}
326+
327+
/**
 * Enables or disables dry-run mode.
 *
 * <p>In dry-run mode deduplicateWarcFile does not open the output file or copy records
 * into it; revisit records are written to a channel that discards them so that the
 * deduplication statistics can still be calculated.
 *
 * @param dryRun {@code true} to calculate statistics without writing output
 */
public void setDryRun(boolean dryRun) {
328+
this.dryRun = dryRun;
329+
}
330+
331+
/**
 * Enables or disables quiet mode.
 *
 * <p>When enabled, deduplicateWarcFile does not print its statistics summary line.
 *
 * @param quiet {@code true} to suppress the statistics output
 */
public void setQuiet(boolean quiet) {
332+
this.quiet = quiet;
333+
}
234334
}

test/org/netpreserve/jwarc/tools/DedupeToolTest.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,21 @@ public void testMinimumSizeThreshold() throws Exception {
193193
assertEquals("Should have 0 revisit records", 0, revisitCount);
194194
}
195195

196+
/**
 * Runs the tool in dry-run mode with a {@code null} output path.
 *
 * NOTE(review): this test contains no assertions; it only passes if the call completes
 * without throwing. Consider asserting that no output file was created.
 */
@Test
197+
public void testDryRun() throws Exception {
198+
// Set cache size to enable deduplication
199+
dedupeTool.setCacheSize(10);
200+
201+
// Set minimum size to a small value to allow deduplication of our test payloads
202+
dedupeTool.setMinimumSize(10);
203+
204+
// Enable dry run mode
205+
dedupeTool.setDryRun(true);
206+
207+
// Run dry run deduplication - this should not create any output file
208+
// The output path is null here, so the dry-run code path must not dereference it.
dedupeTool.deduplicateWarcFile(testWarcFile, null);
209+
}
210+
196211
@Test
197212
public void testDetermineOutputPath() {
198213
// Test with various file extensions

0 commit comments

Comments
 (0)