# duplicate_finder.py
"""
Duplicate File Detector
=======================
Detects duplicate files by comparing their content using SHA-256 hashing.
Works with ALL file types: documents, images, videos, audio, PDFs, etc.
Within each group of identical files, one copy is kept (the largest by reported size) and the user is asked for permission before each other copy is deleted.
Author : Auto-generated
Version : 1.0.0
Date : 2026-03-08
Usage:
python duplicate_finder.py
python duplicate_finder.py <directory_path>
"""
import os
import sys
import hashlib
import time
from collections import defaultdict
# ─────────────────────────────────────────────
# Constants
# ─────────────────────────────────────────────
# Number of bytes requested per read while hashing a file (8192 bytes = 8 KB).
CHUNK_SIZE = 8192 # Read files in 8 KB chunks for memory efficiency
# Extension filter placeholder; None means no filtering by file type.
# NOTE(review): not referenced anywhere else in the visible code - confirm it is still needed.
SUPPORTED_EXTENSIONS = None # None = all file types supported
# ─────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────
def format_size(size_bytes):
    """Render a byte count as text with two decimals and a binary unit.

    The value is divided by 1024 repeatedly until it is below 1024 or the
    units B, KB, MB, GB and TB are exhausted; whatever remains after that
    is reported in PB.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    remaining = size_bytes
    position = 0
    while position < len(units):
        if remaining < 1024:
            return f"{remaining:.2f} {units[position]}"
        # Step up to the next larger unit.
        remaining /= 1024
        position += 1
    return f"{remaining:.2f} PB"
def get_file_type(filepath):
    """Return a human-readable file type label derived from the extension.

    The extension is compared case-insensitively against a table of known
    types. An extension that is not in the table is returned upper-cased
    without its leading dot (``"data.xyz"`` -> ``"XYZ"``).

    Returns ``"Unknown"`` when the path has no extension, including the
    case where the name merely ends in a dot (``"report."``); previously
    that case produced an empty label.
    """
    ext = os.path.splitext(filepath)[1].lower()
    type_map = {
        # Documents
        ".pdf": "PDF", ".doc": "Document", ".docx": "Document",
        ".txt": "Text", ".rtf": "Rich Text", ".odt": "Document",
        ".xls": "Spreadsheet", ".xlsx": "Spreadsheet", ".csv": "CSV",
        ".ppt": "Presentation", ".pptx": "Presentation",
        # Images
        ".jpg": "Image", ".jpeg": "Image", ".png": "Image",
        ".gif": "Image", ".bmp": "Image", ".svg": "Image",
        ".webp": "Image", ".ico": "Icon", ".tiff": "Image",
        # Video
        ".mp4": "Video", ".avi": "Video", ".mkv": "Video",
        ".mov": "Video", ".wmv": "Video", ".flv": "Video", ".webm": "Video",
        # Audio
        ".mp3": "Audio", ".wav": "Audio", ".flac": "Audio",
        ".aac": "Audio", ".ogg": "Audio", ".wma": "Audio",
        # Archives
        ".zip": "Archive", ".rar": "Archive", ".7z": "Archive",
        ".tar": "Archive", ".gz": "Archive",
        # Code
        ".py": "Python", ".js": "JavaScript", ".html": "HTML",
        ".css": "CSS", ".java": "Java", ".cpp": "C++",
    }
    if ext in type_map:
        return type_map[ext]
    # splitext() yields "." for names such as "report."; dropping the dot
    # then leaves an empty string, which is reported as "Unknown" as well.
    label = ext.upper()[1:]
    return label or "Unknown"
def get_file_hash(filepath):
    """Return the SHA-256 hex digest of the file's content.

    The file is opened in binary mode and consumed CHUNK_SIZE bytes at a
    time, so the whole file is never held in memory at once.

    Returns None (after printing a warning) when the file cannot be opened
    or read because of an OSError.
    """
    digest = hashlib.sha256()
    try:
        with open(filepath, "rb") as handle:
            # read() returns b"" at end of file, which stops the iterator.
            for block in iter(lambda: handle.read(CHUNK_SIZE), b""):
                digest.update(block)
        return digest.hexdigest()
    except OSError as err:  # PermissionError is a subclass of OSError
        print(f" ⚠ Could not read: {filepath} ({err})")
        return None
def collect_files(directory):
    """Recursively collect the regular, non-empty files under a directory.

    Returns a list of ``(full_path, size_in_bytes)`` tuples.

    Skipped entries, each reported with a count on stdout:
      * symbolic links - their size and content are those of the target,
        so a link would always be reported as a duplicate of the file it
        points to, and deleting the target would leave a dangling link;
      * empty files (size 0);
      * entries whose size cannot be read (OSError).
    """
    files = []
    skipped_empty = 0
    skipped_links = 0
    skipped_unreadable = 0
    for root, _, filenames in os.walk(directory):
        for name in filenames:
            full_path = os.path.join(root, name)
            if os.path.islink(full_path):
                skipped_links += 1
                continue
            try:
                size = os.path.getsize(full_path)
            except OSError:
                # Best effort: keep scanning, but report the count below
                # instead of dropping the entry silently.
                skipped_unreadable += 1
                continue
            if size == 0:
                skipped_empty += 1
                continue
            files.append((full_path, size))
    if skipped_empty > 0:
        print(f" (Skipped {skipped_empty} empty files)")
    if skipped_links > 0:
        print(f" (Skipped {skipped_links} symbolic links)")
    if skipped_unreadable > 0:
        print(f" (Skipped {skipped_unreadable} unreadable files)")
    return files
def group_by_size(files):
    """Bucket ``(path, size)`` pairs by size and keep only shared sizes.

    A file whose size is unique cannot have a duplicate, so this serves as
    a cheap filter before any content is hashed.

    Returns a dict mapping each size that occurs more than once to the
    list of paths having that size, in the order they were supplied.
    """
    buckets = {}
    for file_path, file_size in files:
        buckets.setdefault(file_size, []).append(file_path)

    shared = {}
    for file_size, members in buckets.items():
        # A bucket with a single member has no possible duplicate.
        if len(members) >= 2:
            shared[file_size] = members
    return shared
def find_duplicates(directory):
    """Scan a directory tree and report files with identical content.

    Runs three stages, printing progress for each: collect the files,
    keep only those sharing a size with another file, then hash the
    remaining candidates.

    Returns a dict mapping a content hash to the list of paths that have
    that hash, restricted to hashes with more than one path. Returns an
    empty dict when no files were collected.
    """
    rule = "═" * 60
    print(f"\n{rule}")
    print(f" 📂 Scanning: {directory}")
    print(f"{rule}\n")

    # Stage 1: gather every file under the directory.
    print(" [1/3] Collecting files...")
    all_files = collect_files(directory)
    print(f" Found {len(all_files)} files.")

    if not all_files:
        print()
        print(" ❌ No files found in the specified directory.")
        return {}

    # Summarise the five most common file types.
    type_counts = {}
    for file_path, _ in all_files:
        kind = get_file_type(file_path)
        type_counts[kind] = type_counts.get(kind, 0) + 1
    ranked = sorted(type_counts.items(), key=lambda item: item[1], reverse=True)
    breakdown = ", ".join(f"{t}: {c}" for t, c in ranked[:5])
    print(f" Types: {breakdown}")
    print()

    # Stage 2: only files that share a size can be duplicates.
    print(" [2/3] Grouping by file size (quick filter)...")
    size_groups = group_by_size(all_files)
    candidates = sum(len(paths) for paths in size_groups.values())
    print(f" {candidates} files are potential duplicates (same size).\n")

    # Stage 3: hash each candidate and bucket the paths by hash.
    print(" [3/3] Computing content hashes...")
    hash_map = {}
    hashed_count = 0
    for paths in size_groups.values():
        for path in paths:
            file_hash = get_file_hash(path)
            if file_hash:
                hash_map.setdefault(file_hash, []).append(path)
            hashed_count += 1
            # Refresh the in-place progress line every 50 files and at the end.
            if hashed_count % 50 == 0 or hashed_count == candidates:
                print(f" Hashed {hashed_count}/{candidates} files...", end="\r")
    print(f" Hashed {hashed_count}/{candidates} files. ✔\n")

    # A hash seen for a single path is not a duplicate.
    duplicates = {}
    for file_hash, paths in hash_map.items():
        if len(paths) > 1:
            duplicates[file_hash] = paths
    return duplicates
def _prompt_delete_choice():
    """Ask until the user answers y, n or q and return that letter.

    End of input (EOFError from input(), e.g. closed or exhausted stdin)
    is treated as "q" so the caller can stop cleanly and still print its
    summary instead of crashing with a traceback.
    """
    while True:
        try:
            choice = input(" ➤ (y = yes / n = skip / q = quit): ").strip().lower()
        except EOFError:
            return "q"
        if choice in ("y", "n", "q"):
            return choice
        print(" Invalid choice. Please enter y, n, or q.")


def display_and_handle_duplicates(duplicates):
    """Show each duplicate group and interactively delete extra copies.

    Args:
        duplicates: dict mapping a content hash to a list of file paths
            that share that hash.

    For every group the files are sorted by size, largest first; the first
    entry is kept and each remaining file is offered for deletion one at a
    time. A file whose size cannot be read is treated as size 0. Answering
    "q" (or reaching end of input) stops immediately. A summary of deleted
    files, skipped files and freed space is printed before returning.

    Returns None. Prints a short message and returns at once when
    ``duplicates`` is empty.
    """
    if not duplicates:
        print(" ✅ No duplicate files found! Your directory is clean.\n")
        return

    total_groups = len(duplicates)
    total_dupes = sum(len(paths) - 1 for paths in duplicates.values())
    print(f"{'═' * 60}")
    print(f" 🔍 Found {total_groups} group(s) of duplicates ({total_dupes} extra files)")
    print(f"{'═' * 60}\n")

    deleted_count = 0
    skipped_count = 0
    freed_space = 0

    for group_num, (file_hash, paths) in enumerate(duplicates.items(), 1):
        # Pair every path with its current size (0 when it cannot be read).
        file_info = []
        for p in paths:
            try:
                size = os.path.getsize(p)
            except OSError:
                size = 0
            file_info.append((p, size))

        # Largest first; the sort is stable, so ties keep their input order.
        file_info.sort(key=lambda x: x[1], reverse=True)
        keeper = file_info[0]
        to_delete = file_info[1:]
        recoverable = sum(size for _, size in to_delete)

        # Display group info
        print(f" ┌── Group {group_num}/{total_groups} ──────────────────────────")
        print(f" │ Hash: {file_hash[:16]}...")
        print(f" │ Files: {len(file_info)} duplicates found")
        print(f" │ Recoverable space: {format_size(recoverable)}")
        print(" │")
        print(f" │ ✅ KEEP: {keeper[0]}")
        print(f" │ Size: {format_size(keeper[1])} | Type: {get_file_type(keeper[0])}")
        for index, (path, size) in enumerate(to_delete, 1):
            print(" │")
            print(f" │ 🗑️ DELETE candidate {index}: {path}")
            print(f" │ Size: {format_size(size)} | Type: {get_file_type(path)}")
        print(f" └{'─' * 50}\n")

        # Ask user for permission - one file at a time
        for path, size in to_delete:
            print(" ❓ Delete this duplicate?")
            print(f" File : {path}")
            print(f" Type : {get_file_type(path)}")
            print(f" Size : {format_size(size)}")

            choice = _prompt_delete_choice()
            if choice == "q":
                print("\n 🛑 Quitting. No more files will be deleted.\n")
                print_summary(deleted_count, skipped_count, freed_space)
                return
            if choice == "n":
                print(" ⏭️ Skipped.\n")
                skipped_count += 1
                continue

            try:
                os.remove(path)
            except OSError as e:
                # A failed deletion is reported but not counted as
                # deleted or skipped.
                print(f" ⚠ Failed to delete: {e}\n")
            else:
                print(" 🗑️ Deleted!\n")
                deleted_count += 1
                freed_space += size

    print_summary(deleted_count, skipped_count, freed_space)
def print_summary(deleted, skipped, freed):
    """Print the closing report: files deleted, files skipped, space freed.

    ``freed`` is a byte count and is rendered through format_size().
    """
    rule = "═" * 60
    print(rule)
    print(" 📊 Summary")
    print(rule)
    print(f" │ Files deleted : {deleted}")
    print(f" │ Files skipped : {skipped}")
    print(f" │ Space freed : {format_size(freed)}")
    print(rule + "\n")
# ─────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────
def main():
    """Run the interactive command-line workflow.

    Prints the banner, takes the target directory from the first
    command-line argument or from a prompt, validates it, asks for
    confirmation, runs the scan and then hands the result to the
    interactive deletion step.

    Exits with status 1 when the path is not a directory and with status 0
    when the user declines the scan.
    """
    banner = (
        "╔════════════════════════════════════════════════════════╗",
        "║ 🔎 DUPLICATE FILE DETECTOR 🔍 ║",
        "║ Finds duplicate files by content comparison ║",
        "║ Supports: docs, images, videos, audio, PDFs, etc. ║",
        "╚════════════════════════════════════════════════════════╝",
    )
    print()
    for banner_line in banner:
        print(banner_line)
    print()

    # The first command-line argument wins; otherwise ask interactively.
    cli_args = sys.argv[1:]
    if cli_args:
        directory = cli_args[0]
    else:
        directory = input(" 📁 Enter the directory path to scan: ").strip()

    # Drop quotes that may surround the path (double quotes, then single).
    directory = directory.strip('"').strip("'")

    if not os.path.isdir(directory):
        print(f"\n ❌ Error: '{directory}' is not a valid directory.\n")
        sys.exit(1)

    print(f"\n 📌 Target: {os.path.abspath(directory)}")
    answer = input(" ❓ Proceed with scan? (y/n): ").strip().lower()
    if answer != "y":
        print(" 🛑 Scan cancelled.\n")
        sys.exit(0)

    # Time only the scan itself, not the interactive deletion afterwards.
    started = time.time()
    duplicates = find_duplicates(directory)
    elapsed = time.time() - started
    print(f" ⏱️ Scan completed in {elapsed:.2f} seconds.\n")

    display_and_handle_duplicates(duplicates)
    print(" 👋 Done! Goodbye.\n")
# Run the interactive tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()