Skip to content

Commit 89106c0

Browse files
committed
Add duplicates report
Prints a list of duplicated files. Enabled by -R duplicates. Signed-off-by: John Pennycook <[email protected]>
1 parent 67ba040 commit 89106c0

File tree

2 files changed

+34
-1
lines changed

2 files changed

+34
-1
lines changed

codebasin/__main__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def _main():
190190
metavar="<report>",
191191
action="append",
192192
default=[],
193-
choices=["all", "summary", "clustering"],
193+
choices=["all", "summary", "clustering", "duplicates"],
194194
help=_help_string(
195195
"Generate a report of the specified type.",
196196
"May be specified multiple times.",
@@ -353,6 +353,10 @@ def report_enabled(name):
353353
if clustering is not None:
354354
print(clustering)
355355

356+
# Print duplicates report
357+
if report_enabled("duplicates"):
358+
report.duplicates(codebase)
359+
356360
sys.exit(0)
357361

358362

codebasin/report.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
import hashlib
99
import itertools as it
1010
import logging
11+
import sys
1112
import warnings
1213
from collections import defaultdict
1314
from pathlib import Path
15+
from typing import TextIO
1416

1517
from codebasin import CodeBase, util
1618

@@ -289,3 +291,30 @@ def find_duplicates(codebase: CodeBase) -> list[set[Path]]:
289291
confirmed_matches.append(matches)
290292

291293
return confirmed_matches
294+
295+
296+
def duplicates(codebase: CodeBase, stream: TextIO = sys.stdout):
    """
    Produce a report identifying sets of duplicate files.

    The report consists of a "Duplicates" heading followed by either a
    "No duplicates found." message or one "Match <i>:" section per set of
    duplicated files, with each file path listed on its own line.

    Parameters
    ----------
    codebase: CodeBase
        The code base to search for duplicates.

    stream: TextIO, default: sys.stdout
        The stream to write the report to.
    """
    confirmed_matches = find_duplicates(codebase)

    print("Duplicates", file=stream)
    print("----------", file=stream)

    if not confirmed_matches:
        print("No duplicates found.", file=stream)
        return

    for i, matches in enumerate(confirmed_matches):
        print(f"Match {i}:", file=stream)
        # Each match is a set, whose iteration order is arbitrary; sort the
        # paths so that the report is reproducible from run to run.
        for path in sorted(matches):
            # Every line of the report must go to the requested stream;
            # omitting file= here would send the paths to sys.stdout even
            # when the caller asked for a different destination.
            print(f"- {path}", file=stream)

0 commit comments

Comments
 (0)