1313import java .net .URI ;
1414import java .net .URL ;
1515import java .net .URLEncoder ;
16+ import java .nio .ByteBuffer ;
1617import java .nio .channels .FileChannel ;
18+ import java .nio .channels .WritableByteChannel ;
1719import java .nio .file .Path ;
1820import java .nio .file .Paths ;
1921import java .time .Instant ;
@@ -29,6 +31,8 @@ public class DedupeTool {
2931 private long minimumSize = 256 ;
3032 private String cdxServer ;
3133 private boolean verbose ;
34+ private boolean dryRun ;
35+ private boolean quiet ;
3236 private LruCache <WarcDigest , CacheValue > digestCache ;
3337
3438 private static class LruCache <K , V > extends LinkedHashMap <K , V > {
@@ -57,14 +61,43 @@ private CacheValue(URI id, String targetUri, Instant date) {
5761 }
5862 }
5963
64+ /**
65+ * A WritableByteChannel that discards everything written.
66+ */
67+ private static class NullWritableByteChannel implements WritableByteChannel {
68+ private boolean open = true ;
69+
70+ @ Override
71+ public int write (ByteBuffer src ) {
72+ int remaining = src .remaining ();
73+ src .position (src .limit ()); // consume all bytes
74+ return remaining ;
75+ }
76+
77+ @ Override
78+ public boolean isOpen () {
79+ return open ;
80+ }
81+
82+ @ Override
83+ public void close () {
84+ open = false ;
85+ }
86+ }
87+
6088 public void deduplicateWarcFile (Path infile , Path outfile ) throws IOException {
89+ long totalRecords = 0 ;
90+ long deduplicatedRecords = 0 ;
91+ long totalSize = 0 ;
92+ long savedSize = 0 ;
93+
94+ // We create the WarcWriter on demand so that if no records are deduplicated we don't write an empty
95+ // gzip member at the end of the file.
96+ WarcWriter writer = null ;
97+
6198 try (FileChannel input = FileChannel .open (infile );
6299 WarcReader reader = new WarcReader (input );
63- FileChannel output = FileChannel .open (outfile , WRITE , CREATE , TRUNCATE_EXISTING )) {
64-
65- // We create the WarcWriter on demand so that if no records are deduplicated we don't write an empty
66- // gzip member at the end of the file.
67- WarcWriter writer = null ;
100+ FileChannel output = dryRun ? null : FileChannel .open (outfile , WRITE , CREATE , TRUNCATE_EXISTING )) {
68101
69102 WarcRecord record = reader .next ().orElse (null );
70103 while (record != null ) {
@@ -74,18 +107,59 @@ public void deduplicateWarcFile(Path infile, Path outfile) throws IOException {
74107 record = reader .next ().orElse (null );
75108 long length = reader .position () - position ;
76109
110+ totalRecords ++;
111+ totalSize += length ;
112+
77113 if (revisit == null ) {
78- if (verbose ) System .out .println ("Copying " + position + ":" + length );
79- transferExactly (input , position , length , output );
114+ if (verbose ) {
115+ System .out .println ((dryRun ? "Would copy " : "Copying" ) + position + ":" + length );
116+ }
117+ if (!dryRun ) {
118+ transferExactly (input , position , length , output );
119+ }
80120 } else {
81- if (verbose ) System .out .println ("Writing revisit for " + position + ":" + length );
82- if (writer == null ) writer = new WarcWriter (output , reader .compression ());
121+ if (verbose ) {
122+ System .out .println ((dryRun ? "Would write" : "Writing" ) + " revisit for " + position + ":" + length );
123+ }
124+ deduplicatedRecords ++;
125+
126+ if (writer == null ) {
127+ if (dryRun ) {
128+ writer = new WarcWriter (new NullWritableByteChannel (), reader .compression ());
129+ } else {
130+ writer = new WarcWriter (output , reader .compression ());
131+ }
132+ }
133+ long beforePosition = writer .position ();
83134 writer .write (revisit );
135+ long revisitSize = writer .position () - beforePosition ;
136+
137+ savedSize += (length - revisitSize );
84138 }
85139 }
140+ } finally {
141+ if (writer != null ) writer .close ();
142+ }
143+
144+ // Print statistics unless quiet mode is enabled
145+ if (!quiet ) {
146+ double percentage = totalSize > 0 ? (double ) savedSize / totalSize * 100 : 0.0 ;
147+ String action = dryRun ? "would dedupe" : "deduped" ;
148+ System .out .printf ("%s: %s %d/%d records, saving %s/%s (%.2f%%)%n" ,
149+ outfile != null ? outfile .getFileName () : infile .getFileName (), action , deduplicatedRecords ,
150+ totalRecords , formatBytes (savedSize ), formatBytes (totalSize ), percentage );
86151 }
87152 }
88153
154+
155+
156+ private String formatBytes (long bytes ) {
157+ if (bytes < 1024 ) return bytes + "B" ;
158+ if (bytes < 1024 * 1024 ) return String .format ("%.2fKB" , bytes / 1024.0 );
159+ if (bytes < 1024 * 1024 * 1024 ) return String .format ("%.2fMB" , bytes / (1024.0 * 1024.0 ));
160+ return String .format ("%.2f GB" , bytes / (1024.0 * 1024.0 * 1024.0 ));
161+ }
162+
89163 private static void transferExactly (FileChannel input , long position , long length , FileChannel output ) throws IOException {
90164 long transferred = 0 ;
91165 while (transferred < length ) {
@@ -198,11 +272,21 @@ public static void main(String[] args) throws IOException {
198272 System .out .println (" --cache-size N Cache N digests for de-duplication (enables cross-URI de-duplication)" );
199273 System .out .println (" --cdx-server URL De-deduplicate against a remote CDX server" );
200274 System .out .println (" --minimum-size BYTES Minimum payload size to consider de-duplicating (default " + dedupeTool .minimumSize + ")" );
275+ System .out .println (" -n, --dry-run Don't write output, just calculate and print deduplication statistics" );
276+ System .out .println (" -q, --quiet Don't print deduplication statistics" );
201277 System .out .println (" -v, --verbose Verbose output" );
202278 return ;
203279 case "-v" :
204280 case "--verbose" :
205- dedupeTool .verbose = true ;
281+ dedupeTool .setVerbose (true );
282+ break ;
283+ case "-n" :
284+ case "--dry-run" :
285+ dedupeTool .setDryRun (true );
286+ break ;
287+ case "-q" :
288+ case "--quiet" :
289+ dedupeTool .setQuiet (true );
206290 break ;
207291 default :
208292 System .err .println ("Unrecognized option: " + args [i ]);
@@ -216,7 +300,15 @@ public static void main(String[] args) throws IOException {
216300 }
217301
218302 for (Path infile : infiles ) {
219- dedupeTool .deduplicateWarcFile (infile , determineOutputPath (infile ));
303+ try {
304+ Path outfile = dedupeTool .dryRun ? null : determineOutputPath (infile );
305+ dedupeTool .deduplicateWarcFile (infile , outfile );
306+ } catch (IOException e ) {
307+ System .err .println ("Failed to deduplicate " + infile + ": " + e .getMessage ());
308+ if (!dedupeTool .quiet ) e .printStackTrace (System .err );
309+ System .exit (1 );
310+ return ;
311+ }
220312 }
221313 }
222314
@@ -231,4 +323,12 @@ public void setMinimumSize(long minimumSize) {
231323 public void setVerbose (boolean verbose ) {
232324 this .verbose = verbose ;
233325 }
326+
327+ public void setDryRun (boolean dryRun ) {
328+ this .dryRun = dryRun ;
329+ }
330+
331+ public void setQuiet (boolean quiet ) {
332+ this .quiet = quiet ;
333+ }
234334}
0 commit comments