Daily-learn/duplicate_finder.py at main · akkinyu2002/Daily-learn · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
"""
Duplicate File Detector
=======================
Detects duplicate files by comparing their content using SHA-256 hashing.
Works with ALL file types: documents, images, videos, audio, PDFs, etc.
Keeps the largest file in each duplicate group and asks permission before deleting the others.

Author  : Auto-generated
Version : 1.0.0
Date    : 2026-03-08

Usage:
    python duplicate_finder.py
    python duplicate_finder.py <directory_path>
"""

import os
import sys
import hashlib
import time
from collections import defaultdict


# ─────────────────────────────────────────────
#  Constants
# ─────────────────────────────────────────────
CHUNK_SIZE = 8192  # Bytes requested per read() while hashing (8 KB), so large files are never loaded whole
SUPPORTED_EXTENSIONS = None  # None = all file types supported. NOTE(review): not referenced by the code visible in this file


# ─────────────────────────────────────────────
#  Helpers
# ─────────────────────────────────────────────
def format_size(size_bytes):
    """
    Render a byte count as a short human-readable string such as "1.50 KB".

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (PB) is reached, and is always shown with two decimal places.
    """
    units = ("B", "KB", "MB", "GB", "TB", "PB")
    last = len(units) - 1
    value = size_bytes
    step = 0
    # "not value < 1024" (rather than ">=") keeps the comparison exactly as
    # the unit-by-unit check would evaluate it for every numeric input.
    while step < last and not value < 1024:
        value = value / 1024
        step += 1
    return "{:.2f} {}".format(value, units[step])


def get_file_type(filepath):
    """
    Map a path's extension to a human-readable category label.

    Matching is case-insensitive. Extensions that are not in the table are
    returned upper-cased without the leading dot; a path with no extension
    yields "Unknown".
    """
    # Category label -> extensions that belong to it.
    categories = {
        # Documents
        "PDF": (".pdf",),
        "Document": (".doc", ".docx", ".odt"),
        "Text": (".txt",),
        "Rich Text": (".rtf",),
        "Spreadsheet": (".xls", ".xlsx"),
        "CSV": (".csv",),
        "Presentation": (".ppt", ".pptx"),
        # Images
        "Image": (
            ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp", ".tiff",
        ),
        "Icon": (".ico",),
        # Video
        "Video": (".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm"),
        # Audio
        "Audio": (".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma"),
        # Archives
        "Archive": (".zip", ".rar", ".7z", ".tar", ".gz"),
        # Code
        "Python": (".py",),
        "JavaScript": (".js",),
        "HTML": (".html",),
        "CSS": (".css",),
        "Java": (".java",),
        "C++": (".cpp",),
    }
    label_for = {
        suffix: label
        for label, suffixes in categories.items()
        for suffix in suffixes
    }

    _, raw_suffix = os.path.splitext(filepath)
    suffix = raw_suffix.lower()
    if suffix in label_for:
        return label_for[suffix]
    if not suffix:
        return "Unknown"
    # Unlisted extension: show it upper-cased, minus the leading dot.
    return suffix.upper()[1:]


def get_file_hash(filepath):
    """
    Return the hex SHA-256 digest of the file's bytes, or None on failure.

    The file is streamed CHUNK_SIZE bytes at a time so memory use stays flat
    no matter how large the file is. If the file cannot be opened or read,
    a warning is printed and None is returned instead of raising.
    """
    digest = hashlib.sha256()
    try:
        with open(filepath, "rb") as stream:
            # iter() with a sentinel keeps calling read() until it returns
            # the empty bytes object that signals end of file.
            for block in iter(lambda: stream.read(CHUNK_SIZE), b""):
                digest.update(block)
    except OSError as err:  # PermissionError is a subclass of OSError
        print(f"  ⚠  Could not read: {filepath} ({err})")
        return None
    return digest.hexdigest()


def collect_files(directory):
    """
    Recursively collect every non-empty regular file under *directory*.

    Skipped entries (each category is reported once, after the walk):
      * symbolic links - a link has the same content as its target, so it
        would be reported as a "duplicate"; deleting the target while
        keeping the link would destroy the only real copy of the data.
      * empty files - all zero-byte files are trivially identical.
      * files whose size cannot be read (e.g. removed mid-scan or
        permission denied).

    Returns:
        A list of (full_path, size_in_bytes) tuples.
    """
    files = []
    skipped_empty = 0
    skipped_links = 0
    skipped_unreadable = 0
    for root, _, filenames in os.walk(directory):
        for name in filenames:
            full_path = os.path.join(root, name)
            if os.path.islink(full_path):
                skipped_links += 1
                continue
            # Keep the try body to the single call that can fail.
            try:
                size = os.path.getsize(full_path)
            except OSError:
                skipped_unreadable += 1
                continue
            if size == 0:
                skipped_empty += 1
                continue
            files.append((full_path, size))
    if skipped_empty > 0:
        print(f"         (Skipped {skipped_empty} empty files)")
    if skipped_links > 0:
        print(f"         (Skipped {skipped_links} symbolic links)")
    if skipped_unreadable > 0:
        print(f"         (Skipped {skipped_unreadable} unreadable files)")
    return files


def group_by_size(files):
    """
    Cheap pre-filter: bucket (path, size) pairs by size.

    Two files of different sizes can never have identical content, so only
    buckets holding at least two paths are returned as {size: [paths]}.
    """
    buckets = {}
    for file_path, byte_count in files:
        buckets.setdefault(byte_count, []).append(file_path)

    shared = {}
    for byte_count, members in buckets.items():
        # A size seen only once cannot belong to a duplicate pair.
        if len(members) >= 2:
            shared[byte_count] = members
    return shared


def find_duplicates(directory):
    """
    Scan *directory* for files with identical content.

    Works in three printed phases: collect files, keep only those sharing a
    size with another file, then hash those candidates.

    Returns:
        A dict {sha256_hex: [paths]} containing only hashes shared by two
        or more files; empty when nothing was found.
    """
    banner = "═" * 60
    print(f"\n{banner}")
    print(f"  📂  Scanning: {directory}")
    print(f"{banner}\n")

    # Phase 1: walk the tree.
    print("  [1/3] Collecting files...")
    all_files = collect_files(directory)
    print(f"         Found {len(all_files)} files.")

    if not all_files:
        print()
        print("  ❌  No files found in the specified directory.")
        return {}

    # Report the five most common file types among the collected files.
    type_counts = {}
    for path, _ in all_files:
        kind = get_file_type(path)
        type_counts[kind] = type_counts.get(kind, 0) + 1
    ranked = sorted(type_counts.items(), key=lambda item: item[1], reverse=True)
    breakdown = ", ".join(f"{kind}: {count}" for kind, count in ranked[:5])
    print(f"         Types: {breakdown}")
    print()

    # Phase 2: only files that share a size can possibly share content.
    print("  [2/3] Grouping by file size (quick filter)...")
    size_groups = group_by_size(all_files)
    candidate_paths = [p for group in size_groups.values() for p in group]
    candidates = len(candidate_paths)
    print(f"         {candidates} files are potential duplicates (same size).\n")

    # Phase 3: hash every candidate and bucket paths by digest.
    print("  [3/3] Computing content hashes...")
    by_digest = {}
    hashed_count = 0
    for hashed_count, path in enumerate(candidate_paths, 1):
        digest = get_file_hash(path)
        if digest:  # None means the file could not be read
            by_digest.setdefault(digest, []).append(path)
        # Refresh the progress line every 50 files and on the last one.
        if hashed_count % 50 == 0 or hashed_count == candidates:
            print(f"         Hashed {hashed_count}/{candidates} files...", end="\r")

    print(f"         Hashed {hashed_count}/{candidates} files. ✔\n")

    # A digest seen once is unique content, not a duplicate.
    return {
        digest: members
        for digest, members in by_digest.items()
        if len(members) > 1
    }


def _measure_group(paths):
    """Return [(path, size)] sorted largest first; unreadable files count as size 0."""
    sized = []
    for path in paths:
        try:
            sized.append((path, os.path.getsize(path)))
        except OSError:
            sized.append((path, 0))
    sized.sort(key=lambda entry: entry[1], reverse=True)
    return sized


def _print_group(group_num, total_groups, file_hash, keeper, to_delete):
    """Print the boxed overview of one duplicate group (keeper plus delete candidates)."""
    recoverable = sum(size for _, size in to_delete)

    print(f"  ┌── Group {group_num}/{total_groups} ──────────────────────────")
    print(f"  │  Hash: {file_hash[:16]}...")
    print(f"  │  Files: {len(to_delete) + 1} duplicates found")
    print(f"  │  Recoverable space: {format_size(recoverable)}")
    print("  │")
    print(f"  │  ✅ KEEP: {keeper[0]}")
    print(f"  │          Size: {format_size(keeper[1])}  |  Type: {get_file_type(keeper[0])}")

    for i, (path, size) in enumerate(to_delete):
        print("  │")
        print(f"  │  🗑️  DELETE candidate {i + 1}: {path}")
        print(f"  │          Size: {format_size(size)}  |  Type: {get_file_type(path)}")

    print(f"  └{'─' * 50}\n")


def _ask_delete_choice():
    """Prompt until the user answers y, n, or q; a closed stdin counts as q."""
    while True:
        try:
            choice = input("     ➤  (y = yes / n = skip / q = quit): ").strip().lower()
        except EOFError:
            # stdin was closed or piped input ran out: stop deleting rather
            # than crash with a traceback in the middle of the session.
            return "q"
        if choice in ("y", "n", "q"):
            return choice
        print("     Invalid choice. Please enter y, n, or q.")


def display_and_handle_duplicates(duplicates):
    """
    Show each duplicate group and interactively delete the extra copies.

    Within a group the files are sorted by size, largest first; the first
    entry is kept and every other file is offered for deletion one at a
    time. Nothing is removed without an explicit "y". Answering "q" (or
    reaching end of input) stops immediately. A summary of deleted/skipped
    files and freed space is printed before returning.

    Args:
        duplicates: dict {content_hash: [paths]} where every list holds
            two or more paths with identical content.

    Returns:
        None.
    """
    if not duplicates:
        print("  ✅  No duplicate files found! Your directory is clean.\n")
        return

    total_groups = len(duplicates)
    total_dupes = sum(len(paths) - 1 for paths in duplicates.values())

    print(f"{'═' * 60}")
    print(f"  🔍  Found {total_groups} group(s) of duplicates ({total_dupes} extra files)")
    print(f"{'═' * 60}\n")

    deleted_count = 0
    skipped_count = 0
    freed_space = 0

    for group_num, (file_hash, paths) in enumerate(duplicates.items(), 1):
        file_info = _measure_group(paths)
        keeper, to_delete = file_info[0], file_info[1:]
        _print_group(group_num, total_groups, file_hash, keeper, to_delete)

        # Ask user for permission — one file at a time
        for path, size in to_delete:
            print("  ❓ Delete this duplicate?")
            print(f"     File : {path}")
            print(f"     Type : {get_file_type(path)}")
            print(f"     Size : {format_size(size)}")

            choice = _ask_delete_choice()

            if choice == "q":
                print("\n  🛑  Quitting. No more files will be deleted.\n")
                print_summary(deleted_count, skipped_count, freed_space)
                return

            if choice == "n":
                print("     ⏭️  Skipped.\n")
                skipped_count += 1
                continue

            try:
                os.remove(path)
            except OSError as e:
                # A failed removal is reported but counted as neither
                # deleted nor skipped.
                print(f"     ⚠  Failed to delete: {e}\n")
            else:
                print("     🗑️  Deleted!\n")
                deleted_count += 1
                freed_space += size

    print_summary(deleted_count, skipped_count, freed_space)


def print_summary(deleted, skipped, freed):
    """
    Print the closing report: files deleted, files skipped, and space freed.

    *freed* is a byte count and is shown in human-readable form.
    """
    rule = "═" * 60
    report = (
        rule,
        "  📊  Summary",
        rule,
        f"  │  Files deleted : {deleted}",
        f"  │  Files skipped : {skipped}",
        f"  │  Space freed   : {format_size(freed)}",
        f"{rule}\n",  # trailing blank line after the closing rule
    )
    for line in report:
        print(line)


# ─────────────────────────────────────────────
#  Main
# ─────────────────────────────────────────────
def _read_answer(prompt):
    """Read one line from stdin; a closed stdin yields "" instead of a traceback."""
    try:
        return input(prompt)
    except EOFError:
        print()  # finish the prompt line that never received an answer
        return ""


def main():
    """
    Run the interactive duplicate finder.

    The directory comes from the first command-line argument or, if none is
    given, from a prompt. After the user confirms, the directory is scanned
    and each duplicate is offered for deletion.

    Exits with status 1 when the path is not a directory and status 0 when
    the user declines the scan. End of input at either prompt is treated as
    an empty answer, so it leads to one of those two exits.
    """
    print()
    print("╔════════════════════════════════════════════════════════╗")
    print("║          🔎  DUPLICATE FILE DETECTOR  🔍              ║")
    print("║   Finds duplicate files by content comparison         ║")
    print("║   Supports: docs, images, videos, audio, PDFs, etc.  ║")
    print("╚════════════════════════════════════════════════════════╝")
    print()

    # Get directory from user
    if len(sys.argv) > 1:
        directory = sys.argv[1]
    else:
        directory = _read_answer("  📁 Enter the directory path to scan: ").strip()
        # Remove surrounding quotes if present (common when a path is pasted)
        directory = directory.strip('"').strip("'")

    # Validate directory
    if not os.path.isdir(directory):
        print(f"\n  ❌  Error: '{directory}' is not a valid directory.\n")
        sys.exit(1)

    # Confirm scan
    print(f"\n  📌 Target: {os.path.abspath(directory)}")
    confirm = _read_answer("  ❓ Proceed with scan? (y/n): ").strip().lower()
    if confirm != "y":
        print("  🛑  Scan cancelled.\n")
        sys.exit(0)

    # Run the duplicate finder. perf_counter() is monotonic, so the elapsed
    # time cannot be skewed by system clock adjustments during the scan.
    start_time = time.perf_counter()
    duplicates = find_duplicates(directory)
    elapsed = time.perf_counter() - start_time
    print(f"  ⏱️  Scan completed in {elapsed:.2f} seconds.\n")

    # Handle results
    display_and_handle_duplicates(duplicates)

    print("  👋  Done! Goodbye.\n")

# Start the interactive scanner only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()