diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 000000000..7244d73af
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,105 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL Advanced"
+
+on:
+ push:
+ branches: [ "master" ]
+ pull_request:
+ branches: [ "master" ]
+ schedule:
+ - cron: '35 2 * * 2'
+
+jobs:
+ analyze:
+ name: Analyze (${{ matrix.language }})
+ # Runner size impacts CodeQL analysis time. To learn more, please see:
+ # - https://gh.io/recommended-hardware-resources-for-running-codeql
+ # - https://gh.io/supported-runners-and-hardware-resources
+ # - https://gh.io/using-larger-runners (GitHub.com only)
+ # Consider using larger runners or machines with greater resources for possible analysis time improvements.
+ runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
+ permissions:
+ # required for all workflows
+ security-events: write
+
+ # required to fetch internal or private CodeQL packs
+ packages: read
+
+ # only required for workflows in private repositories
+ actions: read
+ contents: read
+
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - language: actions
+ build-mode: none
+ - language: java-kotlin
+ build-mode: none # This mode only analyzes Java. Set this to 'autobuild' or 'manual' to analyze Kotlin too.
+ - language: python
+ build-mode: none
+ - language: rust
+ build-mode: none
+ # CodeQL supports the following values for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'rust', 'swift'
+ # Use `c-cpp` to analyze code written in C, C++ or both
+ # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
+ # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
+ # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
+ # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
+ # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
+ # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ # Add any setup steps before running the `github/codeql-action/init` action.
+ # This includes steps like installing compilers or runtimes (`actions/setup-node`
+ # or others). This is typically only required for manual builds.
+ # - name: Setup runtime (example)
+ # uses: actions/setup-example@v1
+
+ # Initializes the CodeQL tools for scanning.
+ - name: Initialize CodeQL
+ uses: github/codeql-action/init@v4
+ with:
+ languages: ${{ matrix.language }}
+ build-mode: ${{ matrix.build-mode }}
+ # If you wish to specify custom queries, you can do so here or in a config file.
+ # By default, queries listed here will override any specified in a config file.
+ # Prefix the list here with "+" to use these queries and those in the config file.
+
+ # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+ # queries: security-extended,security-and-quality
+
+ # If the analyze step fails for one of the languages you are analyzing with
+ # "We were unable to automatically build your code", modify the matrix above
+ # to set the build mode to "manual" for that language. Then modify this step
+ # to build your code.
+ # ℹ️ Command-line programs to run using the OS shell.
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+ - name: Run manual build steps
+ if: matrix.build-mode == 'manual'
+ shell: bash
+ run: |
+ echo 'If you are using a "manual" build mode for one or more of the' \
+ 'languages you are analyzing, replace this with the commands to build' \
+ 'your code, for example:'
+ echo ' make bootstrap'
+ echo ' make release'
+ exit 1
+
+ - name: Perform CodeQL Analysis
+ uses: github/codeql-action/analyze@v4
+ with:
+ category: "/language:${{matrix.language}}"
diff --git a/Cargo.lock b/Cargo.lock
index 82536866d..c5e9414e3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -610,6 +610,12 @@ dependencies = [
"windows-link 0.2.1",
]
+[[package]]
+name = "base64"
+version = "0.21.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
+
[[package]]
name = "base64"
version = "0.22.1"
@@ -1121,6 +1127,7 @@ dependencies = [
"iana-time-zone",
"js-sys",
"num-traits",
+ "serde",
"wasm-bindgen",
"windows-link 0.2.1",
]
@@ -1638,6 +1645,7 @@ dependencies = [
"humansize",
"indicatif",
"log",
+ "serde_json",
]
[[package]]
@@ -1693,6 +1701,7 @@ dependencies = [
"serde",
"serde_json",
"static_assertions",
+ "strsim 0.11.1",
"symphonia",
"tempfile",
"tokio",
@@ -1734,6 +1743,20 @@ dependencies = [
"winapi",
]
+[[package]]
+name = "czkawka_mcp"
+version = "11.0.1"
+dependencies = [
+ "crossbeam-channel",
+ "czkawka_core",
+ "log",
+ "rmcp",
+ "schemars",
+ "serde",
+ "serde_json",
+ "tokio",
+]
+
[[package]]
name = "darling"
version = "0.14.4"
@@ -2002,6 +2025,12 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
+[[package]]
+name = "dyn-clone"
+version = "1.0.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555"
+
[[package]]
name = "ecb"
version = "0.1.2"
@@ -2048,7 +2077,7 @@ dependencies = [
"failure",
"proc-macro2",
"quote",
- "serde_derive_internals",
+ "serde_derive_internals 0.25.0",
"syn 1.0.109",
]
@@ -3882,7 +3911,7 @@ version = "3.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd266c66b0a0e2d4c6db8e710663fc163a2d33595ce997b6fbda407c8759d344"
dependencies = [
- "base64",
+ "base64 0.22.1",
"fast_image_resize 6.0.0",
"image",
"rustdct",
@@ -6809,6 +6838,38 @@ version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422"
+[[package]]
+name = "rmcp"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33a0110d28bd076f39e14bfd5b0340216dd18effeb5d02b43215944cc3e5c751"
+dependencies = [
+ "base64 0.21.7",
+ "chrono",
+ "futures",
+ "paste",
+ "pin-project-lite",
+ "rmcp-macros",
+ "schemars",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.18",
+ "tokio",
+ "tokio-util",
+ "tracing",
+]
+
+[[package]]
+name = "rmcp-macros"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6e2b2fd7497540489fa2db285edd43b7ed14c49157157438664278da6e42a7a"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
[[package]]
name = "rodio"
version = "0.22.2"
@@ -7030,6 +7091,30 @@ dependencies = [
"winapi-util",
]
+[[package]]
+name = "schemars"
+version = "0.8.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615"
+dependencies = [
+ "dyn-clone",
+ "schemars_derive",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "schemars_derive"
+version = "0.8.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "serde_derive_internals 0.29.1",
+ "syn 2.0.117",
+]
+
[[package]]
name = "scoped-tls"
version = "1.0.1"
@@ -7114,6 +7199,17 @@ dependencies = [
"syn 1.0.109",
]
+[[package]]
+name = "serde_derive_internals"
+version = "0.29.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
[[package]]
name = "serde_json"
version = "1.0.149"
@@ -8158,10 +8254,35 @@ dependencies = [
"pin-project-lite",
"signal-hook-registry",
"socket2",
+ "tokio-macros",
"tracing",
"windows-sys 0.61.2",
]
+[[package]]
+name = "tokio-macros"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.7.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+
[[package]]
name = "toml"
version = "0.5.11"
@@ -8562,7 +8683,7 @@ version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e419dff010bb12512b0ae9e3d2f318dfbdf0167fde7eb05465134d4e8756076f"
dependencies = [
- "base64",
+ "base64 0.22.1",
"data-url",
"flate2",
"fontdb",
@@ -8589,7 +8710,7 @@ version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d46cf96c5f498d36b7a9693bc6a7075c0bb9303189d61b2249b0dc3d309c07de"
dependencies = [
- "base64",
+ "base64 0.22.1",
"data-url",
"flate2",
"imagesize",
diff --git a/Cargo.toml b/Cargo.toml
index 77c811031..72c24031a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,7 +4,8 @@ members = [
"czkawka_cli",
"czkawka_gui",
"krokiet",
- "cedinia"
+ "cedinia",
+ "czkawka_mcp",
]
exclude = [
"misc/test_read_perf",
@@ -37,8 +38,8 @@ overflow-checks = true
# But it is used to optimize release builds(and probably also in CI, where time is not so important as in local development)
# Fat lto, generates a lot smaller executable than thin lto
# Also using codegen-units = 1, to generate smaller binaries
-#lto = "fat"
-#codegen-units = 1
+lto = "fat"
+codegen-units = 1
# Optimize all dependencies except application/workspaces, even in debug builds to get reasonable performance e.g. when opening images
[profile.dev.package."*"] # OPT PACKAGES
diff --git a/README.md b/README.md
index dbbdfccbc..59af67209 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@
- **Cache support** - second and further scans should be much faster than the first one
- **Easy to run, easy to compile** - minimal runtime and build dependencies, portable version available
- **CLI frontend** - for easy automation
-- **GUI frontend** - uses Slint or GTK 4 frameworks
+- **GUI frontends** - Slint (Krokiet), GTK 4 (Czkawka GUI), and PySide6/Qt (Kalka)
- **Core library** - allows to reuse functionality in other apps
- **Android app** - experimental touch-friendly frontend for Android devices
- **No spying** - Czkawka does not have access to the Internet, nor does it collect any user information or statistics
@@ -59,6 +59,7 @@ Each tool uses different technologies, so you can find instructions for each of
- [Krokiet GUI (Slint frontend)](krokiet/README.md)
- [Czkawka GUI (GTK frontend)](czkawka_gui/README.md)
+- [Kalka (Qt/PySide6 frontend)](kalka/README.md)
- [Czkawka CLI](czkawka_cli/README.md)
- [Czkawka Core](czkawka_core/README.md)
- [Cedinia](cedinia/README.md)
@@ -68,37 +69,37 @@ Each tool uses different technologies, so you can find instructions for each of
In this comparison remember, that even if app have same features they may work different(e.g. one app may have more
options to choose than other).
-| | Krokiet | Czkawka | Cedinia | FSlint | DupeGuru | Bleachbit |
-|:-------------------------:|:-----------:|:----------------:|:-------:|:------:|:-----------------:|:-----------:|
-| Language | Rust | Rust | Rust | Python | Python/Obj-C | Python |
-| Framework base language | Rust | C | Rust | C | C/C++/Obj-C/Swift | C |
-| Framework | Slint | GTK 4 | Slint | PyGTK2 | Qt 5 (PyQt)/Cocoa | PyGTK3 |
-| OS | Lin,Mac,Win | Lin,Mac,Win | Android | Lin | Lin,Mac,Win | Lin,Mac,Win |
-| Duplicate finder | ✔ | ✔ | ✔ | ✔ | ✔ | |
-| Empty files | ✔ | ✔ | ✔ | ✔ | | |
-| Empty folders | ✔ | ✔ | ✔ | ✔ | | |
-| Temporary files | ✔ | ✔ | ✔ | ✔ | | ✔ |
-| Big files | ✔ | ✔ | ✔ | | | |
-| Similar images | ✔ | ✔ | ✔ | | ✔ | |
-| Similar videos | ✔ | ✔ | | | | |
-| Music duplicates(tags) | ✔ | ✔ | ✔ | | ✔ | |
-| Music duplicates(content) | ✔ | ✔ | ✔ | | | |
-| Invalid symlinks | ✔ | ✔ | ✔ | ✔ | | |
-| Broken files | ✔ | ✔ | ✔ | | | |
-| Invalid names/extensions | ✔ | ✔ | ✔ | ✔ | | |
-| Exif cleaner | ✔ | | ✔ | | | |
-| Video optimizer | ✔ | | | | | |
-| Bad Names | ✔ | | ✔ | | | |
-| Names conflict | | | | ✔ | | |
-| Installed packages | | | | ✔ | | |
-| Bad ID | | | | ✔ | | |
-| Non stripped binaries | | | | ✔ | | |
-| Redundant whitespace | | | | ✔ | | |
-| Overwriting files | | | | ✔ | | ✔ |
-| Portable version | ✔ | ✔ | | | | ✔ |
-| Multiple languages | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ |
-| Cache support | ✔ | ✔ | ✔ | | ✔ | |
-| In active development | Yes | Yes** | Yes*** | No | No* | Yes |
+| | Krokiet | Kalka | Czkawka | Cedinia | FSlint | DupeGuru | Bleachbit |
+|:-------------------------:|:-----------:|:----------------:|:----------------:|:-------:|:------:|:-----------------:|:-----------:|
+| Language | Rust | Python | Rust | Rust | Python | Python/Obj-C | Python |
+| Framework base language | Rust | C++ | C | Rust | C | C/C++/Obj-C/Swift | C |
+| Framework | Slint | PySide6 (Qt 6) | GTK 4 | Slint | PyGTK2 | Qt 5 (PyQt)/Cocoa | PyGTK3 |
+| OS | Lin,Mac,Win | Lin,Mac,Win | Lin,Mac,Win | Android | Lin | Lin,Mac,Win | Lin,Mac,Win |
+| Duplicate finder | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | |
+| Empty files | ✔ | ✔ | ✔ | ✔ | ✔ | | |
+| Empty folders | ✔ | ✔ | ✔ | ✔ | ✔ | | |
+| Temporary files | ✔ | ✔ | ✔ | ✔ | ✔ | | ✔ |
+| Big files | ✔ | ✔ | ✔ | ✔ | | | |
+| Similar images | ✔ | ✔ | ✔ | ✔ | | ✔ | |
+| Similar videos | ✔ | ✔ | ✔ | | | | |
+| Music duplicates(tags) | ✔ | ✔ | ✔ | ✔ | | ✔ | |
+| Music duplicates(content) | ✔ | ✔ | ✔ | ✔ | | | |
+| Invalid symlinks | ✔ | ✔ | ✔ | ✔ | ✔ | | |
+| Broken files | ✔ | ✔ | ✔ | ✔ | | | |
+| Invalid names/extensions | ✔ | ✔ | ✔ | ✔ | ✔ | | |
+| Exif cleaner | ✔ | ✔ | | ✔ | | | |
+| Video optimizer | ✔ | ✔ | | | | | |
+| Bad Names | ✔ | ✔ | | ✔ | | | |
+| Names conflict | | | | | ✔ | | |
+| Installed packages | | | | | ✔ | | |
+| Bad ID | | | | | ✔ | | |
+| Non stripped binaries | | | | | ✔ | | |
+| Redundant whitespace | | | | | ✔ | | |
+| Overwriting files | | | | | ✔ | | ✔ |
+| Portable version | ✔ | ✔ | ✔ | | | | ✔ |
+| Multiple languages | ✔ | | ✔ | ✔ | ✔ | ✔ | ✔ |
+| Cache support | ✔ | ✔ | ✔ | ✔ | | ✔ | |
+| In active development | Yes | Yes | Yes** | Yes*** | No | No* | Yes |
* Few small commits added recently and last version released in 2023
** Czkawka GTK is in maintenance mode receiving only bugfixes
@@ -131,6 +132,7 @@ console apps, then take a look at these:
Czkawka exposes its common functionality through a crate called **`czkawka_core`**, which can be reused by other projects.
It is written in Rust and is used by all Czkawka frontends (`czkawka_gui`, `czkawka_cli`, `krokiet`, `cedinia`).
+The `kalka` frontend uses `czkawka_cli` as its backend, communicating via JSON output and `--json-progress` for real-time progress data.
It is also used by external projects, such as:
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 000000000..034e84803
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,21 @@
+# Security Policy
+
+## Supported Versions
+
+The table below lists which versions of this project are
+currently being supported with security updates.
+
+| Version | Supported |
+| ------- | ------------------ |
+| 11.x    | :white_check_mark: |
+| 10.x    | :x:                |
+| 9.x     | :x:                |
+| < 9.0   | :x:                |
+
+## Reporting a Vulnerability
+
+To report a vulnerability, please use GitHub's private vulnerability
+reporting feature ("Report a vulnerability" under the Security tab of
+this repository) instead of opening a public issue. Reports are
+reviewed on a best-effort basis; accepted vulnerabilities will be
+fixed in a patch release and credited to the reporter if desired.
diff --git a/czkawka_cli/Cargo.toml b/czkawka_cli/Cargo.toml
index 2b24c2c56..063f55117 100644
--- a/czkawka_cli/Cargo.toml
+++ b/czkawka_cli/Cargo.toml
@@ -18,6 +18,7 @@ indicatif = "0.18"
crossbeam-channel = { version = "0.5", features = [] }
ctrlc = { version = "3.4", features = ["termination"] }
humansize = "2.1"
+serde_json = "1"
[features]
default = []
diff --git a/czkawka_cli/README.md b/czkawka_cli/README.md
index 3c88431f8..1c9d3c29a 100644
--- a/czkawka_cli/README.md
+++ b/czkawka_cli/README.md
@@ -44,13 +44,57 @@ czkawka_cli dup --help
Example usage:
```shell
-czkawka dup -d /home/rafal -e /home/rafal/Obrazy -m 25 -x 7z rar IMAGE -s hash -f results.txt -D aeo
-czkawka empty-folders -d /home/rafal/rr /home/gateway -f results.txt
-czkawka big -d /home/rafal/ /home/piszczal -e /home/rafal/Roman -n 25 -x VIDEO -f results.txt
-czkawka empty-files -d /home/rafal /home/szczekacz -e /home/rafal/Pulpit -R -f results.txt
-czkawka temp -d /home/rafal/ -E */.git */tmp* *Pulpit -f results.txt -D
-czkawka music -d /home/rafal -e /home/rafal/Pulpit -z "artist,year, ARTISTALBUM, ALBUM___tiTlE" -f results.txt
-czkawka symlinks -d /home/kicikici/ /home/szczek -e /home/kicikici/jestempsem -x jpg -f results.txt
+czkawka_cli dup -d /home/rafal -e /home/rafal/Obrazy -m 25 -x 7z rar IMAGE -s hash -f results.txt -D aeo
+czkawka_cli empty-folders -d /home/rafal/rr /home/gateway -f results.txt
+czkawka_cli big -d /home/rafal/ /home/piszczal -e /home/rafal/Roman -n 25 -x VIDEO -f results.txt
+czkawka_cli empty-files -d /home/rafal /home/szczekacz -e /home/rafal/Pulpit -R -f results.txt
+czkawka_cli temp -d /home/rafal/ -E */.git */tmp* *Pulpit -f results.txt -D
+czkawka_cli music -d /home/rafal -e /home/rafal/Pulpit -z "artist,year, ARTISTALBUM, ALBUM___tiTlE" -f results.txt
+czkawka_cli symlinks -d /home/kicikici/ /home/szczek -e /home/kicikici/jestempsem -x jpg -f results.txt
+```
+
+## JSON output
+
+Results can be saved as compact or pretty-printed JSON:
+
+```shell
+czkawka_cli dup -d /home/user --compact-file-to-save results.json
+czkawka_cli dup -d /home/user --pretty-json-file-to-save results.json
+```
+
+## Machine-readable progress (`--json-progress`)
+
+The `--json-progress` flag outputs real-time progress data as JSON lines to stderr. This is used by GUI frontends (such as the PySide6 frontend) to display accurate progress bars.
+
+Each line is a JSON object with the following structure:
+```json
+{
+ "progress": {
+ "sstage": "DuplicatePreHashing",
+ "checking_method": "Hash",
+ "current_stage_idx": 2,
+ "max_stage_idx": 6,
+ "entries_checked": 50000,
+ "entries_to_check": 94500,
+ "bytes_checked": 204800000,
+ "bytes_to_check": 387072000,
+ "tool_type": "Duplicate"
+ },
+ "stage_name": "Calculating prehashes"
+}
+```
+
+Fields:
+- `sstage` - Internal stage identifier (e.g., `CollectingFiles`, `DuplicateFullHashing`, `SimilarImagesComparingHashes`)
+- `current_stage_idx` / `max_stage_idx` - Current stage number and total stages (e.g., 2/6 for duplicates)
+- `entries_checked` / `entries_to_check` - Files processed and total to process
+- `bytes_checked` / `bytes_to_check` - Bytes processed and total (for hashing stages)
+- `stage_name` - Human-readable stage description
+
+Example usage:
+```shell
+# Capture progress on stderr while saving results to JSON
+czkawka_cli dup -d /home/user --json-progress -N --compact-file-to-save results.json 2>progress.jsonl
```
## LICENSE
diff --git a/czkawka_cli/src/commands.rs b/czkawka_cli/src/commands.rs
index 8980629c1..530884d4f 100644
--- a/czkawka_cli/src/commands.rs
+++ b/czkawka_cli/src/commands.rs
@@ -1,5 +1,7 @@
use std::path::PathBuf;
+use log::error;
+
#[cfg(not(feature = "no_colors"))]
use clap::builder::Styles;
#[cfg(not(feature = "no_colors"))]
@@ -115,6 +117,27 @@ pub enum Commands {
ExifRemover(ExifRemoverArgs),
}
+impl Commands {
+ pub fn get_json_progress(&self) -> bool {
+ match self {
+ Self::Duplicates(a) => a.common_cli_items.json_progress,
+ Self::EmptyFolders(a) => a.common_cli_items.json_progress,
+ Self::BiggestFiles(a) => a.common_cli_items.json_progress,
+ Self::EmptyFiles(a) => a.common_cli_items.json_progress,
+ Self::Temporary(a) => a.common_cli_items.json_progress,
+ Self::SimilarImages(a) => a.common_cli_items.json_progress,
+ Self::SameMusic(a) => a.common_cli_items.json_progress,
+ Self::InvalidSymlinks(a) => a.common_cli_items.json_progress,
+ Self::BrokenFiles(a) => a.common_cli_items.json_progress,
+ Self::SimilarVideos(a) => a.common_cli_items.json_progress,
+ Self::BadExtensions(a) => a.common_cli_items.json_progress,
+ Self::BadNames(a) => a.common_cli_items.json_progress,
+ Self::VideoOptimizer(a) => a.common_cli_items.json_progress,
+ Self::ExifRemover(a) => a.common_cli_items.json_progress,
+ }
+ }
+}
+
#[derive(Debug, clap::Args)]
pub struct DuplicatesArgs {
#[clap(flatten)]
@@ -169,10 +192,17 @@ pub struct DuplicatesArgs {
long,
default_value = "HASH",
value_parser = parse_checking_method_duplicate,
- help = "Search method (NAME, SIZE, HASH)",
- long_help = "Methods to search files.\nNAME - Fast but rarely usable,\nSIZE - Fast but not accurate, checking by the file's size,\nHASH - The slowest method, checking by the hash of the entire file"
+ help = "Search method (NAME, FUZZY_NAME, SIZE, SIZE_NAME, HASH)",
+ long_help = "Methods to search files.\nNAME - Fast but rarely usable, finds files with identical names,\nFUZZY_NAME - Finds files with similar names using Jaro-Winkler distance,\nSIZE - Fast but not accurate, checking by the file's size,\nSIZE_NAME - Checks by both file size and name,\nHASH - The slowest method, checking by the hash of the entire file"
)]
pub search_method: CheckingMethod,
+ #[clap(
+ long,
+ default_value = "0.85",
+ help = "Name similarity threshold for FUZZY_NAME mode (0.0–1.0)",
+ long_help = "Minimum Jaro-Winkler similarity score (0.0–1.0) for two filenames to be considered similar. Higher values require closer matches. Only used with FUZZY_NAME search method."
+ )]
+ pub name_similarity_threshold: f64,
#[clap(flatten)]
pub delete_method: DMethod,
#[clap(
@@ -848,6 +878,14 @@ pub struct CommonCliItems {
long_help = "Disables the cache system. This will make scanning slower but ensures fresh results without cached data."
)]
pub disable_cache: bool,
+ #[clap(
+ long,
+ help = "Output progress as JSON lines to stderr",
+ long_help = "Outputs progress data as JSON lines to stderr for machine consumption. \
+ Each line is a JSON object with fields: sstage, current_stage_idx, max_stage_idx, \
+ entries_checked, entries_to_check, bytes_checked, bytes_to_check, tool_type."
+ )]
+ pub json_progress: bool,
}
#[derive(Debug, clap::Args, Clone, Copy)]
@@ -998,29 +1036,44 @@ pub struct IgnoreSameSize {
impl FileToSave {
pub(crate) fn file_name(&self) -> Option<&str> {
- if let Some(file_name) = &self.file_to_save {
- return file_name.to_str();
+ match &self.file_to_save {
+ Some(file_name) => match file_name.to_str() {
+ Some(s) => Some(s),
+ None => {
+ error!("Output file path contains invalid UTF-8: {:?}", file_name);
+ None
+ }
+ },
+ None => None,
}
-
- None
}
}
impl JsonCompactFileToSave {
pub(crate) fn file_name(&self) -> Option<&str> {
- if let Some(file_name) = &self.compact_file_to_save {
- return file_name.to_str();
+ match &self.compact_file_to_save {
+ Some(file_name) => match file_name.to_str() {
+ Some(s) => Some(s),
+ None => {
+ error!("Compact JSON output file path contains invalid UTF-8: {:?}", file_name);
+ None
+ }
+ },
+ None => None,
}
-
- None
}
}
impl JsonPrettyFileToSave {
pub(crate) fn file_name(&self) -> Option<&str> {
- if let Some(file_name) = &self.pretty_file_to_save {
- return file_name.to_str();
+ match &self.pretty_file_to_save {
+ Some(file_name) => match file_name.to_str() {
+ Some(s) => Some(s),
+ None => {
+ error!("Pretty JSON output file path contains invalid UTF-8: {:?}", file_name);
+ None
+ }
+ },
+ None => None,
}
-
- None
}
}
@@ -1081,10 +1134,11 @@ fn parse_tolerance(src: &str) -> Result {
fn parse_checking_method_duplicate(src: &str) -> Result {
match src.to_ascii_lowercase().as_str() {
"name" => Ok(CheckingMethod::Name),
+ "fuzzy_name" => Ok(CheckingMethod::FuzzyName),
"size" => Ok(CheckingMethod::Size),
"size_name" => Ok(CheckingMethod::SizeName),
"hash" => Ok(CheckingMethod::Hash),
- _ => Err("Couldn't parse the search method (allowed: NAME, SIZE, HASH)"),
+ _ => Err("Couldn't parse the search method (allowed: NAME, FUZZY_NAME, SIZE, SIZE_NAME, HASH)"),
}
}
diff --git a/czkawka_cli/src/main.rs b/czkawka_cli/src/main.rs
index 871183662..777b34ecb 100644
--- a/czkawka_cli/src/main.rs
+++ b/czkawka_cli/src/main.rs
@@ -5,7 +5,7 @@ use std::thread;
use clap::Parser;
use commands::Commands;
-use crossbeam_channel::{Receiver, Sender, unbounded};
+use crossbeam_channel::{Receiver, Sender, bounded};
use czkawka_core::common::config_cache_path::{print_infos_and_warnings, set_config_cache_path};
use czkawka_core::common::consts::DEFAULT_THREAD_SIZE;
use czkawka_core::common::image::register_image_decoding_hooks;
@@ -36,7 +36,7 @@ use crate::commands::{
Args, BadExtensionsArgs, BadNamesArgs, BiggestFilesArgs, BrokenFilesArgs, CommonCliItems, DMethod, DuplicatesArgs, EmptyFilesArgs, EmptyFoldersArgs, ExifRemoverArgs,
InvalidSymlinksArgs, SDMethod, SameMusicArgs, SimilarImagesArgs, SimilarVideosArgs, TemporaryArgs, VideoOptimizerArgs,
};
-use crate::progress::connect_progress;
+use crate::progress::{connect_progress, connect_progress_json};
mod commands;
mod progress;
@@ -45,6 +45,7 @@ mod progress;
pub struct CliOutput {
pub found_any_files: bool,
pub ignored_error_code_on_found: bool,
+ pub had_save_errors: bool,
pub output: String,
}
@@ -65,7 +66,8 @@ fn main() {
debug!("Running command - {command:?}");
}
- let (progress_sender, progress_receiver): (Sender, Receiver) = unbounded();
+ let json_progress = command.get_json_progress();
+ let (progress_sender, progress_receiver): (Sender, Receiver) = bounded(256);
let stop_flag = Arc::new(AtomicBool::new(false));
let store_flag_cloned = stop_flag.clone();
@@ -98,7 +100,11 @@ fn main() {
})
.expect("Error setting Ctrl-C handler");
- connect_progress(&progress_receiver);
+ if json_progress {
+ connect_progress_json(&progress_receiver);
+ } else {
+ connect_progress(&progress_receiver);
+ }
let cli_output = calculate_thread.join().expect("Failed to join calculation thread");
@@ -107,7 +113,9 @@ fn main() {
println!("{}", cli_output.output);
}
- if cli_output.found_any_files && !cli_output.ignored_error_code_on_found {
+ if cli_output.had_save_errors {
+ std::process::exit(1);
+ } else if cli_output.found_any_files && !cli_output.ignored_error_code_on_found {
std::process::exit(11);
} else {
std::process::exit(0);
@@ -122,6 +130,7 @@ fn duplicates(duplicates: DuplicatesArgs, stop_flag: &Arc, progress_
maximal_file_size,
minimal_cached_file_size,
search_method,
+ name_similarity_threshold,
delete_method,
hash_type,
allow_hard_links,
@@ -137,7 +146,8 @@ fn duplicates(duplicates: DuplicatesArgs, stop_flag: &Arc, progress_
minimal_cached_file_size,
minimal_prehash_cache_file_size,
case_sensitive_name_comparison.case_sensitive_name_comparison,
- );
+ )
+ .with_name_similarity_threshold(name_similarity_threshold);
let mut tool = DuplicateFinder::new(params);
set_common_settings(&mut tool, &common_cli_items, Some(reference_directories.reference_directories.as_ref()));
@@ -547,38 +557,46 @@ fn exif_remover(exif_remover: ExifRemoverArgs, stop_flag: &Arc, prog
}
fn save_and_write_results_to_writer(component: &T, common_cli_items: &CommonCliItems) -> CliOutput {
+ let mut had_save_errors = false;
+
if let Some(file_name) = common_cli_items.file_to_save.file_name()
&& let Err(e) = component.print_results_to_file(file_name)
{
error!("Failed to save results to file {e}");
+ had_save_errors = true;
}
if let Some(file_name) = common_cli_items.json_compact_file_to_save.file_name()
&& let Err(e) = component.save_results_to_file_as_json(file_name, false)
{
error!("Failed to save compact json results to file {e}");
+ had_save_errors = true;
}
if let Some(file_name) = common_cli_items.json_pretty_file_to_save.file_name()
&& let Err(e) = component.save_results_to_file_as_json(file_name, true)
{
error!("Failed to save pretty json results to file {e}");
+ had_save_errors = true;
}
let mut buf_writer = std::io::BufWriter::new(Vec::new());
if !common_cli_items.do_not_print.do_not_print_results {
- let _ = component.print_results_to_writer(&mut buf_writer).map_err(|e| {
+ if let Err(e) = component.print_results_to_writer(&mut buf_writer) {
error!("Failed to print results to output: {e}");
- });
+ had_save_errors = true;
+ }
}
if !common_cli_items.do_not_print.do_not_print_messages {
- let _ = component.get_text_messages().print_messages_to_writer(&mut buf_writer).map_err(|e| {
+ if let Err(e) = component.get_text_messages().print_messages_to_writer(&mut buf_writer) {
error!("Failed to print results to output: {e}");
- });
+ had_save_errors = true;
+ }
}
let mut cli_output = CliOutput {
found_any_files: component.found_any_items(),
ignored_error_code_on_found: common_cli_items.ignore_error_code_on_found,
+ had_save_errors,
output: String::new(),
};
diff --git a/czkawka_cli/src/progress.rs b/czkawka_cli/src/progress.rs
index 72952de01..f0b4f77a6 100644
--- a/czkawka_cli/src/progress.rs
+++ b/czkawka_cli/src/progress.rs
@@ -1,3 +1,4 @@
+use std::io::Write;
use std::time::Duration;
use crossbeam_channel::Receiver;
@@ -97,6 +98,35 @@ pub(crate) fn get_progress_message(progress_data: &ProgressData) -> String {
.to_string()
}
+/// Output progress data as JSON lines to stderr for machine consumption.
+/// Each line is a complete JSON object that can be parsed independently.
+pub(crate) fn connect_progress_json(progress_receiver: &Receiver) {
+ let stderr = std::io::stderr();
+ let mut stderr = stderr.lock();
+ while let Ok(progress_data) = progress_receiver.recv() {
+ // Build a JSON object with human-readable stage name included
+ let stage_name = if progress_data.sstage == CurrentStage::CollectingFiles {
+ if progress_data.tool_type == ToolType::EmptyFolders {
+ "Collecting folders".to_string()
+ } else {
+ "Collecting files".to_string()
+ }
+ } else if progress_data.sstage.check_if_loading_saving_cache() {
+ if progress_data.sstage.check_if_loading_cache() {
+ "Loading cache".to_string()
+ } else {
+ "Saving cache".to_string()
+ }
+ } else {
+ get_progress_message(&progress_data)
+ };
+
+ // Serialize stage_name too, so quotes/backslashes in it cannot produce invalid JSON lines
+ if let (Ok(json), Ok(stage_json)) = (serde_json::to_string(&progress_data), serde_json::to_string(&stage_name)) {
+ let _ = writeln!(stderr, "{{\"progress\":{json},\"stage_name\":{stage_json}}}");
+ }
+ }
+}
+
pub(crate) fn get_progress_bar_for_collect_files() -> ProgressBar {
let pb = ProgressBar::new_spinner();
pb.enable_steady_tick(Duration::from_millis(120));
diff --git a/czkawka_core/Cargo.toml b/czkawka_core/Cargo.toml
index d6a20f988..b969f2ebd 100644
--- a/czkawka_core/Cargo.toml
+++ b/czkawka_core/Cargo.toml
@@ -97,6 +97,7 @@ open = "5.3"
log-panics = { version = "2.1.0", features = ["with-backtrace"] }
deunicode = "1.6.2"
+strsim = "0.11"
glibc_musl_version = "0.1.0"
rand = "0.10.0"
diff --git a/czkawka_core/src/common/cache.rs b/czkawka_core/src/common/cache.rs
index 7ec81914e..0ea0a8367 100644
--- a/czkawka_core/src/common/cache.rs
+++ b/czkawka_core/src/common/cache.rs
@@ -2,7 +2,7 @@
mod cleaning;
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, HashMap};
use std::io::{BufReader, BufWriter};
use std::path::Path;
use std::{fs, mem};
@@ -45,7 +45,7 @@ fn get_cache_size(file_name: &Path) -> String {
}
#[fun_time(message = "save_cache_to_file_generalized", level = "debug")]
-pub fn save_cache_to_file_generalized(cache_file_name: &str, hashmap: &BTreeMap, save_also_as_json: bool, minimum_file_size: u64) -> Messages
+pub fn save_cache_to_file_generalized(cache_file_name: &str, hashmap: &HashMap, save_also_as_json: bool, minimum_file_size: u64) -> Messages
where
T: Serialize + ResultEntry + Sized + Send + Sync,
{
@@ -90,10 +90,10 @@ where
}
pub(crate) fn extract_loaded_cache(
- loaded_hash_map: &BTreeMap,
- files_to_check: BTreeMap,
- records_already_cached: &mut BTreeMap,
- non_cached_files_to_check: &mut BTreeMap,
+ loaded_hash_map: &HashMap,
+ files_to_check: HashMap,
+ records_already_cached: &mut HashMap,
+ non_cached_files_to_check: &mut HashMap,
) where
T: Clone,
{
@@ -107,7 +107,7 @@ pub(crate) fn extract_loaded_cache(
}
#[fun_time(message = "load_cache_from_file_generalized_by_path", level = "debug")]
-pub fn load_cache_from_file_generalized_by_path(cache_file_name: &str, delete_outdated_cache: bool, used_files: &BTreeMap) -> (Messages, Option>)
+pub fn load_cache_from_file_generalized_by_path(cache_file_name: &str, delete_outdated_cache: bool, used_files: &HashMap) -> (Messages, Option>)
where
for<'a> T: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone,
{
@@ -130,14 +130,14 @@ where
return (text_messages, None);
};
- debug!("Converting cache Vec into BTreeMap");
+ debug!("Converting cache Vec into HashMap");
let number_of_entries = vec_loaded_entries.len();
let start_time = std::time::Instant::now();
- let map_loaded_entries: BTreeMap = vec_loaded_entries
+ let map_loaded_entries: HashMap = vec_loaded_entries
.into_iter()
.map(|file_entry| (file_entry.get_path().to_string_lossy().into_owned(), file_entry))
.collect();
- debug!("Converted cache Vec({number_of_entries} results) into BTreeMap in {:?}", start_time.elapsed());
+ debug!("Converted cache Vec({number_of_entries} results) into HashMap in {:?}", start_time.elapsed());
(text_messages, Some(map_loaded_entries))
}
@@ -289,9 +289,9 @@ where
pub(crate) fn load_and_split_cache_generalized_by_path(
cache_file_name: &str,
- mut items_to_check: BTreeMap,
+ mut items_to_check: HashMap,
common_data: &mut C,
-) -> (BTreeMap, BTreeMap, BTreeMap)
+) -> (HashMap, HashMap, HashMap)
where
for<'a> K: Deserialize<'a> + ResultEntry + Sized + Send + Sync + Clone,
{
@@ -301,8 +301,8 @@ where
let loaded_hash_map;
- let mut records_already_cached: BTreeMap = Default::default();
- let mut non_cached_files_to_check: BTreeMap = Default::default();
+ let mut records_already_cached: HashMap = Default::default();
+ let mut non_cached_files_to_check: HashMap = Default::default();
let (messages, loaded_items) = load_cache_from_file_generalized_by_path::(cache_file_name, common_data.get_delete_outdated_cache(), &items_to_check);
common_data.get_text_messages_mut().extend_with_another_messages(messages);
@@ -325,14 +325,14 @@ where
(loaded_hash_map, records_already_cached, non_cached_files_to_check)
}
-pub(crate) fn save_and_connect_cache_generalized_by_path(cache_file_name: &str, vec_file_entry: &[K], loaded_hash_map: BTreeMap, common_data: &mut C)
+pub(crate) fn save_and_connect_cache_generalized_by_path(cache_file_name: &str, vec_file_entry: &[K], loaded_hash_map: HashMap, common_data: &mut C)
where
K: Serialize + ResultEntry + Sized + Send + Sync + Clone,
{
if !common_data.get_use_cache() {
return;
}
- let mut all_results: BTreeMap = Default::default();
+ let mut all_results: HashMap = Default::default();
for file_entry in vec_file_entry.iter().cloned() {
all_results.insert(file_entry.get_path().to_string_lossy().to_string(), file_entry);
@@ -347,7 +347,7 @@ where
#[cfg(test)]
mod tests {
- use std::collections::BTreeMap;
+ use std::collections::HashMap;
use std::fs;
use std::path::PathBuf;
use std::sync::Once;
@@ -408,17 +408,17 @@ mod tests {
#[test]
fn test_extract_loaded_cache() {
- let mut loaded_cache = BTreeMap::new();
+ let mut loaded_cache = HashMap::new();
loaded_cache.insert("file1".to_string(), TestEntry::new("/tmp/file1", 100, 1000, 10));
loaded_cache.insert("file2".to_string(), TestEntry::new("/tmp/file2", 200, 2000, 20));
- let mut files_to_check = BTreeMap::new();
+ let mut files_to_check = HashMap::new();
files_to_check.insert("file1".to_string(), TestEntry::new("/tmp/file1", 100, 1000, 10));
files_to_check.insert("file3".to_string(), TestEntry::new("/tmp/file3", 300, 3000, 30));
files_to_check.insert("file2".to_string(), TestEntry::new("/tmp/file2", 200, 2000, 20));
- let mut records_already_cached = BTreeMap::new();
- let mut non_cached_files_to_check = BTreeMap::new();
+ let mut records_already_cached = HashMap::new();
+ let mut non_cached_files_to_check = HashMap::new();
extract_loaded_cache(&loaded_cache, files_to_check, &mut records_already_cached, &mut non_cached_files_to_check);
@@ -433,13 +433,13 @@ mod tests {
#[test]
fn test_extract_loaded_cache_empty() {
- let loaded_cache: BTreeMap = BTreeMap::new();
- let mut files_to_check = BTreeMap::new();
+ let loaded_cache: HashMap = HashMap::new();
+ let mut files_to_check = HashMap::new();
files_to_check.insert("file1".to_string(), TestEntry::new("/tmp/file1", 100, 1000, 10));
files_to_check.insert("file2".to_string(), TestEntry::new("/tmp/file2", 200, 2000, 20));
- let mut records_already_cached = BTreeMap::new();
- let mut non_cached_files_to_check = BTreeMap::new();
+ let mut records_already_cached = HashMap::new();
+ let mut non_cached_files_to_check = HashMap::new();
extract_loaded_cache(&loaded_cache, files_to_check, &mut records_already_cached, &mut non_cached_files_to_check);
@@ -449,16 +449,16 @@ mod tests {
#[test]
fn test_extract_loaded_cache_all_cached() {
- let mut loaded_cache = BTreeMap::new();
+ let mut loaded_cache = HashMap::new();
loaded_cache.insert("file1".to_string(), TestEntry::new("/tmp/file1", 100, 1000, 10));
loaded_cache.insert("file2".to_string(), TestEntry::new("/tmp/file2", 200, 2000, 20));
- let mut files_to_check = BTreeMap::new();
+ let mut files_to_check = HashMap::new();
files_to_check.insert("file1".to_string(), TestEntry::new("/tmp/file1", 100, 1000, 10));
files_to_check.insert("file2".to_string(), TestEntry::new("/tmp/file2", 200, 2000, 20));
- let mut records_already_cached = BTreeMap::new();
- let mut non_cached_files_to_check = BTreeMap::new();
+ let mut records_already_cached = HashMap::new();
+ let mut non_cached_files_to_check = HashMap::new();
extract_loaded_cache(&loaded_cache, files_to_check, &mut records_already_cached, &mut non_cached_files_to_check);
@@ -474,7 +474,7 @@ mod tests {
fs::write(&temp_file, "test content").unwrap();
let metadata = fs::metadata(&temp_file).unwrap();
- let mut cache_to_save = BTreeMap::new();
+ let mut cache_to_save = HashMap::new();
cache_to_save.insert(
temp_file.to_string_lossy().to_string(),
TestEntry::new(temp_file.to_str().unwrap(), metadata.len(), metadata.modified().unwrap().elapsed().unwrap().as_secs(), 42),
@@ -524,7 +524,7 @@ mod tests {
));
// Convert to flat map for saving
- let mut flat_cache = BTreeMap::new();
+ let mut flat_cache = HashMap::new();
for entries in cache_to_save.values() {
for entry in entries {
flat_cache.insert(entry.path.to_string_lossy().to_string(), entry.clone());
@@ -552,7 +552,7 @@ mod tests {
let temp_file = temp_dir.path().join("test_file.txt");
fs::write(&temp_file, "test").unwrap();
- let mut cache_to_save = BTreeMap::new();
+ let mut cache_to_save = HashMap::new();
cache_to_save.insert("small_file".to_string(), TestEntry::new("/tmp/small", 10, 1000, 1));
cache_to_save.insert("large_file".to_string(), TestEntry::new("/tmp/large", 1000, 2000, 2));
@@ -581,7 +581,7 @@ mod tests {
fs::write(&temp_file, "test content").unwrap();
let metadata = fs::metadata(&temp_file).unwrap();
- let mut cache_to_save = BTreeMap::new();
+ let mut cache_to_save = HashMap::new();
cache_to_save.insert(
temp_file.to_string_lossy().to_string(),
TestEntry::new(temp_file.to_str().unwrap(), metadata.len(), metadata.modified().unwrap().elapsed().unwrap().as_secs(), 42),
@@ -597,7 +597,7 @@ mod tests {
// Create new files_to_check with updated metadata
let new_metadata = fs::metadata(&temp_file).unwrap();
- let mut files_to_check = BTreeMap::new();
+ let mut files_to_check = HashMap::new();
files_to_check.insert(
temp_file.to_string_lossy().to_string(),
TestEntry::new(
@@ -621,7 +621,7 @@ mod tests {
fn test_load_nonexistent_cache() {
setup_cache_path();
let cache_name = format!("nonexistent_cache_{}", std::process::id());
- let files_to_check: BTreeMap = BTreeMap::new();
+ let files_to_check: HashMap = HashMap::new();
let (messages, loaded_cache) = load_cache_from_file_generalized_by_path::(&cache_name, false, &files_to_check);
@@ -636,7 +636,7 @@ mod tests {
let temp_file = temp_dir.path().join("test_file.txt");
fs::write(&temp_file, "test content").unwrap();
- let mut cache_to_save = BTreeMap::new();
+ let mut cache_to_save = HashMap::new();
cache_to_save.insert("test_key".to_string(), TestEntry::new("/tmp/test", 100, 1000, 42));
// Save cache with JSON enabled
diff --git a/czkawka_core/src/common/dir_traversal.rs b/czkawka_core/src/common/dir_traversal.rs
index b682a614a..2c2ffb072 100644
--- a/czkawka_core/src/common/dir_traversal.rs
+++ b/czkawka_core/src/common/dir_traversal.rs
@@ -268,7 +268,7 @@ where
}
}
}
- file_results.sort_by_cached_key(|fe| fe.path.to_string_lossy().to_string());
+ file_results.sort_unstable_by(|a, b| a.path.cmp(&b.path));
for fe in file_results {
let key = (self.group_by)(&fe);
grouped_file_entries.entry(key).or_default().push(fe);
@@ -281,9 +281,10 @@ where
return DirTraversalResult::Stopped;
}
+ let dir_max_len = std::thread::available_parallelism().map_or(2, |p| (p.get() / 2).max(2));
let segments: Vec<_> = folders_to_check
.into_par_iter()
- .with_max_len(2) // Avoiding checking too many folders in batch
+ .with_max_len(dir_max_len)
.map(|current_folder| {
let mut dir_result = Vec::new();
let mut warnings = Vec::new();
@@ -350,7 +351,7 @@ where
for (segment, warnings, mut fe_result) in segments {
folders_to_check.extend(segment);
all_warnings.extend(warnings);
- fe_result.sort_by_cached_key(|fe| fe.path.to_string_lossy().to_string());
+ fe_result.sort_unstable_by(|a, b| a.path.cmp(&b.path));
for fe in fe_result {
let key = (self.group_by)(&fe);
grouped_file_entries.entry(key).or_default().push(fe);
diff --git a/czkawka_core/src/common/directories.rs b/czkawka_core/src/common/directories.rs
index f7955421c..d3bf2c433 100644
--- a/czkawka_core/src/common/directories.rs
+++ b/czkawka_core/src/common/directories.rs
@@ -222,6 +222,7 @@ impl Directories {
// Get device IDs for included directories, probably ther better solution would be to get one id per directory, but this is faster, but a little less precise
#[cfg(target_family = "unix")]
if self.exclude_other_filesystems() {
+ self.included_dev_ids.clear();
for d in &self.included_directories {
match fs::metadata(d) {
Ok(m) => self.included_dev_ids.push(m.dev()),
diff --git a/czkawka_core/src/common/items.rs b/czkawka_core/src/common/items.rs
index f49ab916b..09532f300 100644
--- a/czkawka_core/src/common/items.rs
+++ b/czkawka_core/src/common/items.rs
@@ -72,6 +72,8 @@ impl ExcludedItems {
checked_expressions.push(expression);
}
+ self.expressions.clear();
+ self.connected_expressions.clear();
for checked_expression in &checked_expressions {
let item = new_excluded_item(checked_expression);
self.expressions.push(item.expression.clone());
diff --git a/czkawka_core/src/common/model.rs b/czkawka_core/src/common/model.rs
index 41919a49d..aba1f5cc5 100644
--- a/czkawka_core/src/common/model.rs
+++ b/czkawka_core/src/common/model.rs
@@ -6,7 +6,7 @@ use xxhash_rust::xxh3::Xxh3;
use crate::common::traits::ResultEntry;
use crate::tools::duplicate::MyHasher;
-#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
+#[derive(Debug, PartialEq, Eq, Clone, Copy, Default, Serialize, Deserialize)]
pub enum ToolType {
Duplicate,
EmptyFolders,
@@ -37,6 +37,7 @@ pub enum CheckingMethod {
#[default]
None,
Name,
+ FuzzyName,
SizeName,
Size,
Hash,
diff --git a/czkawka_core/src/common/progress_data.rs b/czkawka_core/src/common/progress_data.rs
index d23abe7e8..e927534cf 100644
--- a/czkawka_core/src/common/progress_data.rs
+++ b/czkawka_core/src/common/progress_data.rs
@@ -1,4 +1,5 @@
use log::error;
+use serde::Serialize;
use crate::common::model::{CheckingMethod, ToolType};
// Empty files
@@ -66,7 +67,7 @@ use crate::common::model::{CheckingMethod, ToolType};
// Deleting files
// Renaming files
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, Serialize)]
pub struct ProgressData {
pub sstage: CurrentStage,
pub checking_method: CheckingMethod,
@@ -95,7 +96,7 @@ impl ProgressData {
}
}
-#[derive(Debug, Clone, Copy, Eq, PartialEq)]
+#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize)]
pub enum CurrentStage {
DeletingFiles,
RenamingFiles,
@@ -176,7 +177,7 @@ impl ProgressData {
let tool_type_checking_method: Option = match self.checking_method {
CheckingMethod::AudioTags | CheckingMethod::AudioContent => Some(ToolType::SameMusic),
- CheckingMethod::Name | CheckingMethod::SizeName | CheckingMethod::Size | CheckingMethod::Hash => Some(ToolType::Duplicate),
+ CheckingMethod::Name | CheckingMethod::FuzzyName | CheckingMethod::SizeName | CheckingMethod::Size | CheckingMethod::Hash => Some(ToolType::Duplicate),
CheckingMethod::None => None,
};
if let Some(tool_type) = tool_type_checking_method {
diff --git a/czkawka_core/src/tools/broken_files/core.rs b/czkawka_core/src/tools/broken_files/core.rs
index 5cfff05b6..0da01a24c 100644
--- a/czkawka_core/src/tools/broken_files/core.rs
+++ b/czkawka_core/src/tools/broken_files/core.rs
@@ -1,4 +1,4 @@
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
use std::process::Command;
@@ -243,12 +243,12 @@ impl BrokenFiles {
}
#[fun_time(message = "load_cache", level = "debug")]
- fn load_cache(&mut self) -> (BTreeMap, BTreeMap, BTreeMap) {
+ fn load_cache(&mut self) -> (HashMap, HashMap, HashMap) {
load_and_split_cache_generalized_by_path(&get_broken_files_cache_file(), mem::take(&mut self.files_to_check), self)
}
#[fun_time(message = "save_to_cache", level = "debug")]
- fn save_to_cache(&mut self, vec_file_entry: &[BrokenEntry], loaded_hash_map: BTreeMap) {
+ fn save_to_cache(&mut self, vec_file_entry: &[BrokenEntry], loaded_hash_map: HashMap) {
save_and_connect_cache_generalized_by_path(&get_broken_files_cache_file(), vec_file_entry, loaded_hash_map, self);
}
diff --git a/czkawka_core/src/tools/broken_files/mod.rs b/czkawka_core/src/tools/broken_files/mod.rs
index 30d861308..3f677d66e 100644
--- a/czkawka_core/src/tools/broken_files/mod.rs
+++ b/czkawka_core/src/tools/broken_files/mod.rs
@@ -5,7 +5,7 @@ pub mod core;
mod tests;
pub mod traits;
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::time::Duration;
@@ -88,7 +88,7 @@ impl BrokenFilesParameters {
pub struct BrokenFiles {
common_data: CommonToolData,
information: Info,
- files_to_check: BTreeMap,
+ files_to_check: HashMap,
broken_files: Vec,
params: BrokenFilesParameters,
}
diff --git a/czkawka_core/src/tools/duplicate/core.rs b/czkawka_core/src/tools/duplicate/core.rs
index d78ad9464..14a4e5cee 100644
--- a/czkawka_core/src/tools/duplicate/core.rs
+++ b/czkawka_core/src/tools/duplicate/core.rs
@@ -1,4 +1,4 @@
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, HashMap};
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
@@ -8,9 +8,9 @@ use std::{mem, thread};
use crossbeam_channel::Sender;
use fun_time::fun_time;
use humansize::{BINARY, format_size};
-use indexmap::IndexMap;
use log::debug;
use rayon::prelude::*;
+use strsim;
use crate::common::cache::{CACHE_DUPLICATE_VERSION, load_cache_from_file_generalized_by_size, save_cache_to_file_generalized};
use crate::common::dir_traversal::{DirTraversalBuilder, DirTraversalResult};
@@ -32,10 +32,12 @@ impl DuplicateFinder {
files_with_identical_size: Default::default(),
files_with_identical_size_names: Default::default(),
files_with_identical_hashes: Default::default(),
+ files_with_fuzzy_names: Default::default(),
files_with_identical_names_referenced: Default::default(),
files_with_identical_size_names_referenced: Default::default(),
files_with_identical_size_referenced: Default::default(),
files_with_identical_hashes_referenced: Default::default(),
+ files_with_fuzzy_names_referenced: Default::default(),
params,
}
}
@@ -102,7 +104,7 @@ impl DuplicateFinder {
})
.collect::)>>();
for (fe, vec_fe) in vec {
- self.files_with_identical_names_referenced.insert(fe.path.to_string_lossy().to_string(), (fe, vec_fe));
+ self.files_with_identical_names_referenced.insert(fe.path.to_string_lossy().into_owned(), (fe, vec_fe));
}
}
self.calculate_name_stats();
@@ -113,6 +115,132 @@ impl DuplicateFinder {
}
}
+ #[fun_time(message = "check_files_fuzzy_name", level = "debug")]
+ pub(crate) fn check_files_fuzzy_name(&mut self, stop_flag: &Arc, progress_sender: Option<&Sender>) -> WorkContinueStatus {
+ // Group candidate files by lowercased extension: similarity is only checked within a group, which keeps the O(n^2) pairwise pass small
+ let group_by_func = |fe: &FileEntry| fe.path.extension().map(|e| e.to_string_lossy().to_lowercase()).unwrap_or_default();
+
+ let result = DirTraversalBuilder::new()
+ .common_data(&self.common_data)
+ .group_by(group_by_func)
+ .stop_flag(stop_flag)
+ .progress_sender(progress_sender)
+ .checking_method(CheckingMethod::FuzzyName)
+ .build()
+ .run();
+
+ match result {
+ DirTraversalResult::SuccessFiles { grouped_file_entries, warnings } => {
+ self.common_data.text_messages.warnings.extend(warnings);
+
+ let threshold = self.get_params().name_similarity_threshold; // Jaro-Winkler score in [0.0, 1.0]; pairs at or above it are merged
+ let case_sensitive = self.get_params().case_sensitive_name_comparison;
+ let mut all_groups: Vec> = Vec::new();
+
+ for (_ext, files) in grouped_file_entries {
+ if files.len() < 2 {
+ continue; // a single file can have no fuzzy match
+ }
+ if check_if_stop_received(stop_flag) {
+ return WorkContinueStatus::Stop;
+ }
+
+ let names: Vec = files
+ .iter()
+ .map(|fe| {
+ let name = fe
+ .path
+ .file_stem()
+ .unwrap_or_else(|| panic!("Found invalid file_stem \"{}\"", fe.path.to_string_lossy())) // NOTE(review): panics if the path has no file name — presumably unreachable for traversal results; confirm
+ .to_string_lossy();
+ if case_sensitive { name.to_string() } else { name.to_lowercase() } // normalize case up front so comparisons below are uniform
+ })
+ .collect();
+
+ // Union-Find over file indices: similar-name pairs are unioned so transitive matches end up in one group
+ let n = files.len();
+ let mut parent: Vec = (0..n).collect(); // each index starts as its own root
+
+ fn find(parent: &mut Vec, i: usize) -> usize {
+ if parent[i] != i {
+ parent[i] = find(parent, parent[i]); // recursive path compression
+ }
+ parent[i]
+ }
+
+ fn union(parent: &mut Vec, a: usize, b: usize) {
+ let ra = find(parent, a);
+ let rb = find(parent, b);
+ if ra != rb {
+ parent[ra] = rb; // no union-by-rank; path compression above keeps trees shallow in practice
+ }
+ }
+
+ for i in 0..n {
+ for j in (i + 1)..n { // O(n^2) pairwise comparison within one extension group
+ let similarity = strsim::jaro_winkler(&names[i], &names[j]);
+ if similarity >= threshold {
+ union(&mut parent, i, j);
+ }
+ }
+ }
+
+ // Bucket indices by their union-find root; each bucket is one candidate group
+ let mut groups: HashMap> = HashMap::new();
+ for i in 0..n {
+ let root = find(&mut parent, i);
+ groups.entry(root).or_default().push(i);
+ }
+
+ for (_root, indices) in groups {
+ if indices.len() > 1 { // only keep real groups (2+ members)
+ let group: Vec = indices.into_iter().map(|idx| files[idx].clone().into_duplicate_entry()).collect();
+ all_groups.push(group);
+ }
+ }
+ }
+
+ self.files_with_fuzzy_names = all_groups;
+
+ if self.common_data.use_reference_folders {
+ // Split each group into reference-folder files vs normal files, mirroring the other check_files_* methods
+ let groups = mem::take(&mut self.files_with_fuzzy_names);
+ self.files_with_fuzzy_names_referenced = groups
+ .into_iter()
+ .filter_map(|vec_file_entry| {
+ let (mut files_from_referenced_folders, normal_files): (Vec<_>, Vec<_>) = vec_file_entry
+ .into_iter()
+ .partition(|e| self.common_data.directories.is_in_referenced_directory(e.get_path()));
+
+ if normal_files.is_empty() {
+ None // group has no deletable files, so drop it
+ } else {
+ files_from_referenced_folders.pop().map(|file| (file, normal_files)) // pick one reference file; None (group dropped) if there is none
+ }
+ })
+ .collect();
+ }
+
+ self.calculate_fuzzy_name_stats(); // fill Info counters for reporting
+ WorkContinueStatus::Continue
+ }
+ DirTraversalResult::Stopped => WorkContinueStatus::Stop,
+ }
+ }
+
+ fn calculate_fuzzy_name_stats(&mut self) { // aggregate fuzzy-name group/file counters into self.information
+ if self.common_data.use_reference_folders {
+ for (_fe, vector) in &self.files_with_fuzzy_names_referenced {
+ self.information.number_of_duplicated_files_by_fuzzy_name += vector.len(); // every non-reference file in the group counts as a duplicate
+ self.information.number_of_groups_by_fuzzy_name += 1;
+ }
+ } else {
+ for vector in &self.files_with_fuzzy_names {
+ self.information.number_of_duplicated_files_by_fuzzy_name += vector.len() - 1; // one file per group is the "original"; groups always hold 2+ entries so this cannot underflow
+ self.information.number_of_groups_by_fuzzy_name += 1;
+ }
+ }
+ }
+
fn calculate_name_stats(&mut self) {
if self.common_data.use_reference_folders {
for (_fe, vector) in self.files_with_identical_names_referenced.values() {
@@ -195,7 +323,7 @@ impl DuplicateFinder {
.collect::)>>();
for (fe, vec_fe) in vec {
self.files_with_identical_size_names_referenced
- .insert((fe.size, fe.path.to_string_lossy().to_string()), (fe, vec_fe));
+ .insert((fe.size, fe.path.to_string_lossy().into_owned()), (fe, vec_fe));
}
}
self.calculate_size_name_stats();
@@ -367,16 +495,16 @@ impl DuplicateFinder {
fn prehash_save_cache_at_exit(
&mut self,
loaded_hash_map: BTreeMap>,
- pre_hash_results: Vec<(u64, BTreeMap>, Vec)>,
+ pre_hash_results: Vec<(u64, HashMap>, Vec)>,
) {
if self.get_params().use_prehash_cache {
// All results = records already cached + computed results
- let mut save_cache_to_hashmap: BTreeMap = Default::default();
+ let mut save_cache_to_hashmap: HashMap = Default::default();
for (size, vec_file_entry) in loaded_hash_map {
if size >= self.get_params().minimal_prehash_cache_file_size {
for file_entry in vec_file_entry {
- save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().to_string(), file_entry);
+ save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().into_owned(), file_entry);
}
}
}
@@ -385,7 +513,7 @@ impl DuplicateFinder {
if size >= self.get_params().minimal_prehash_cache_file_size {
for vec_file_entry in hash_map.into_values() {
for file_entry in vec_file_entry {
- save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().to_string(), file_entry);
+ save_cache_to_hashmap.insert(file_entry.path.to_string_lossy().into_owned(), file_entry);
}
}
}
@@ -436,12 +564,13 @@ impl DuplicateFinder {
let non_cached_files_to_check: Vec<(u64, Vec)> = non_cached_files_to_check.into_iter().collect();
debug!("Starting calculating prehash");
+ let rayon_max_len = std::thread::available_parallelism().map_or(3, |p| p.get().max(3));
#[expect(clippy::type_complexity)]
- let pre_hash_results: Vec<(u64, BTreeMap>, Vec)> = non_cached_files_to_check
+ let pre_hash_results: Vec<(u64, HashMap>, Vec)> = non_cached_files_to_check
.into_par_iter()
- .with_max_len(3) // Vectors and BTreeMaps for really big inputs, leave some jobs to 0 thread, to avoid that I minimized max tasks for each thread to 3, which improved performance
+ .with_max_len(rayon_max_len)
.map(|(size, vec_file_entry)| {
- let mut hashmap_with_hash: BTreeMap> = Default::default();
+ let mut hashmap_with_hash: HashMap> = Default::default();
let mut errors: Vec = Vec::new();
THREAD_BUFFER.with_borrow_mut(|buffer| {
@@ -513,16 +642,26 @@ impl DuplicateFinder {
for (size, mut vec_file_entry) in used_map {
if let Some(cached_vec_file_entry) = loaded_hash_map.get(&size) {
- // TODO maybe hashmap is not needed when using < 4 elements
- let mut cached_path_entries: IndexMap<&Path, DuplicateEntry> = IndexMap::new();
- for file_entry in cached_vec_file_entry {
- cached_path_entries.insert(&file_entry.path, file_entry.clone());
- }
- for file_entry in vec_file_entry {
- if let Some(cached_file_entry) = cached_path_entries.swap_remove(file_entry.path.as_path()) {
- records_already_cached.entry(size).or_default().push(cached_file_entry);
- } else {
- non_cached_files_to_check.entry(size).or_default().push(file_entry);
+ if cached_vec_file_entry.len() < 4 {
+ // For very small groups, linear scan is faster than building a map
+ for file_entry in vec_file_entry {
+ if let Some(cached) = cached_vec_file_entry.iter().find(|ce| ce.path == file_entry.path) {
+ records_already_cached.entry(size).or_default().push(cached.clone());
+ } else {
+ non_cached_files_to_check.entry(size).or_default().push(file_entry);
+ }
+ }
+ } else {
+ let mut cached_path_entries: HashMap<&Path, DuplicateEntry> = HashMap::with_capacity(cached_vec_file_entry.len());
+ for file_entry in cached_vec_file_entry {
+ cached_path_entries.insert(&file_entry.path, file_entry.clone());
+ }
+ for file_entry in vec_file_entry {
+ if let Some(cached_file_entry) = cached_path_entries.remove(file_entry.path.as_path()) {
+ records_already_cached.entry(size).or_default().push(cached_file_entry);
+ } else {
+ non_cached_files_to_check.entry(size).or_default().push(file_entry);
+ }
}
}
} else {
@@ -576,7 +715,7 @@ impl DuplicateFinder {
fn full_hashing_save_cache_at_exit(
&mut self,
records_already_cached: BTreeMap>,
- full_hash_results: &mut Vec<(u64, BTreeMap>, Vec)>,
+ full_hash_results: &mut Vec<(u64, HashMap>, Vec)>,
loaded_hash_map: BTreeMap>,
) {
if !self.common_data.use_cache {
@@ -593,7 +732,7 @@ impl DuplicateFinder {
}
}
// Size doesn't exists add results to files
- let mut temp_hashmap: BTreeMap> = Default::default();
+ let mut temp_hashmap: HashMap> = Default::default();
for file_entry in vec_file_entry {
temp_hashmap.entry(file_entry.hash.clone()).or_default().push(file_entry);
}
@@ -601,16 +740,16 @@ impl DuplicateFinder {
}
// Must save all results to file, old loaded from file with all currently counted results
- let mut all_results: BTreeMap = Default::default();
+ let mut all_results: HashMap = Default::default();
for (_size, vec_file_entry) in loaded_hash_map {
for file_entry in vec_file_entry {
- all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry);
+ all_results.insert(file_entry.path.to_string_lossy().into_owned(), file_entry);
}
}
- for (_size, hashmap, _errors) in full_hash_results {
+ for (_size, hashmap, _errors) in full_hash_results.iter() {
for vec_file_entry in hashmap.values() {
for file_entry in vec_file_entry {
- all_results.insert(file_entry.path.to_string_lossy().to_string(), file_entry.clone());
+ all_results.insert(file_entry.path.to_string_lossy().into_owned(), file_entry.clone());
}
}
}
@@ -659,11 +798,12 @@ impl DuplicateFinder {
"Starting full hashing of {} files",
non_cached_files_to_check.iter().map(|(_size, v)| v.len() as u64).sum::()
);
- let mut full_hash_results: Vec<(u64, BTreeMap>, Vec)> = non_cached_files_to_check
+ let rayon_max_len = std::thread::available_parallelism().map_or(3, |p| p.get().max(3));
+ let mut full_hash_results: Vec<(u64, HashMap>, Vec)> = non_cached_files_to_check
.into_par_iter()
- .with_max_len(3)
+ .with_max_len(rayon_max_len)
.map(|(size, vec_file_entry)| {
- let mut hashmap_with_hash: BTreeMap> = Default::default();
+ let mut hashmap_with_hash: HashMap> = Default::default();
let mut errors: Vec = Vec::new();
THREAD_BUFFER.with_borrow_mut(|buffer| {
diff --git a/czkawka_core/src/tools/duplicate/mod.rs b/czkawka_core/src/tools/duplicate/mod.rs
index 409e87a19..76597caf0 100644
--- a/czkawka_core/src/tools/duplicate/mod.rs
+++ b/czkawka_core/src/tools/duplicate/mod.rs
@@ -76,6 +76,8 @@ pub struct Info {
pub number_of_duplicated_files_by_name: usize,
pub number_of_groups_by_size_name: usize,
pub number_of_duplicated_files_by_size_name: usize,
+ pub number_of_groups_by_fuzzy_name: usize,
+ pub number_of_duplicated_files_by_fuzzy_name: usize,
pub lost_space_by_size: u64,
pub lost_space_by_hash: u64,
pub scanning_time: Duration,
@@ -89,6 +91,7 @@ pub struct DuplicateFinderParameters {
pub minimal_cache_file_size: u64,
pub minimal_prehash_cache_file_size: u64,
pub case_sensitive_name_comparison: bool,
+ pub name_similarity_threshold: f64,
}
impl DuplicateFinderParameters {
@@ -107,8 +110,13 @@ impl DuplicateFinderParameters {
minimal_cache_file_size,
minimal_prehash_cache_file_size,
case_sensitive_name_comparison,
+ name_similarity_threshold: 0.85,
}
}
+ pub fn with_name_similarity_threshold(mut self, threshold: f64) -> Self {
+ self.name_similarity_threshold = threshold.clamp(0.0, 1.0);
+ self
+ }
}
pub struct DuplicateFinder {
@@ -122,6 +130,8 @@ pub struct DuplicateFinder {
files_with_identical_size: BTreeMap>,
// File Size, next grouped by file size, next grouped by hash
files_with_identical_hashes: BTreeMap>>,
+ // Fuzzy name groups: group_id -> Vec
+ files_with_fuzzy_names: Vec>,
// File Size, File Entry
files_with_identical_names_referenced: BTreeMap)>,
// File (Size, Name), File Entry
@@ -130,6 +140,8 @@ pub struct DuplicateFinder {
files_with_identical_size_referenced: BTreeMap)>,
// File Size, next grouped by file size, next grouped by hash
files_with_identical_hashes_referenced: BTreeMap)>>,
+ // Fuzzy name groups with reference: (reference, Vec)
+ files_with_fuzzy_names_referenced: Vec<(DuplicateEntry, Vec)>,
params: DuplicateFinderParameters,
}
@@ -218,6 +230,14 @@ impl DuplicateFinder {
pub fn get_files_with_identical_size_names_referenced(&self) -> &BTreeMap<(u64, String), (DuplicateEntry, Vec)> {
&self.files_with_identical_size_names_referenced
}
+
+ pub fn get_files_with_fuzzy_names(&self) -> &Vec> {
+ &self.files_with_fuzzy_names
+ }
+
+ pub fn get_files_with_fuzzy_names_referenced(&self) -> &Vec<(DuplicateEntry, Vec)> {
+ &self.files_with_fuzzy_names_referenced
+ }
}
pub(crate) fn hash_calculation_limit(buffer: &mut [u8], file_entry: &DuplicateEntry, hash_type: HashType, limit: u64, size_counter: &Arc) -> Result {
diff --git a/czkawka_core/src/tools/duplicate/traits.rs b/czkawka_core/src/tools/duplicate/traits.rs
index 5f1358b90..9f7ce811b 100644
--- a/czkawka_core/src/tools/duplicate/traits.rs
+++ b/czkawka_core/src/tools/duplicate/traits.rs
@@ -25,6 +25,7 @@ impl DeletingItems for DuplicateFinder {
let files_to_delete = match self.get_params().check_method {
CheckingMethod::Name => self.files_with_identical_names.values().cloned().collect::>(),
+ CheckingMethod::FuzzyName => self.files_with_fuzzy_names.clone(),
CheckingMethod::SizeName => self.files_with_identical_size_names.values().cloned().collect::>(),
CheckingMethod::Hash => self.files_with_identical_hashes.values().flatten().cloned().collect::>(),
CheckingMethod::Size => self.files_with_identical_size.values().cloned().collect::>(),
@@ -52,6 +53,12 @@ impl Search for DuplicateFinder {
return;
}
}
+ CheckingMethod::FuzzyName => {
+ self.common_data.stopped_search = self.check_files_fuzzy_name(stop_flag, progress_sender) == WorkContinueStatus::Stop;
+ if self.common_data.stopped_search {
+ return;
+ }
+ }
CheckingMethod::SizeName => {
self.common_data.stopped_search = self.check_files_size_name(stop_flag, progress_sender) == WorkContinueStatus::Stop;
if self.common_data.stopped_search {
@@ -123,6 +130,7 @@ impl DebugPrint for DuplicateFinder {
println!("Hashed files list size - {}", self.files_with_identical_hashes.len());
println!("Files with identical names - {}", self.files_with_identical_names.len());
println!("Files with identical size names - {}", self.files_with_identical_size_names.len());
+ println!("Files with fuzzy names - {}", self.files_with_fuzzy_names.len());
println!("Files with identical names referenced - {}", self.files_with_identical_names_referenced.len());
println!("Files with identical size names referenced - {}", self.files_with_identical_size_names_referenced.len());
println!("Files with identical size referenced - {}", self.files_with_identical_size_referenced.len());
@@ -178,6 +186,48 @@ impl PrintResults for DuplicateFinder {
write!(writer, "Not found any files with same names.")?;
}
}
+ CheckingMethod::FuzzyName => {
+ if !self.files_with_fuzzy_names.is_empty() {
+ writeln!(
+ writer,
+ "-------------------------------------------------Files with similar names-------------------------------------------------"
+ )?;
+ writeln!(
+ writer,
+ "Found {} files in {} groups with similar names (threshold: {:.0}%)",
+ self.information.number_of_duplicated_files_by_fuzzy_name,
+ self.information.number_of_groups_by_fuzzy_name,
+ self.params.name_similarity_threshold * 100.0,
+ )?;
+ for vector in &self.files_with_fuzzy_names {
+ writeln!(writer, "\n---- {} files", vector.len())?;
+ for j in vector {
+ writeln!(writer, "\"{}\"", j.path.to_string_lossy())?;
+ }
+ }
+ } else if !self.files_with_fuzzy_names_referenced.is_empty() {
+ writeln!(
+ writer,
+ "-------------------------------------------------Files with similar names in referenced folders-------------------------------------------------"
+ )?;
+ writeln!(
+ writer,
+ "Found {} files in {} groups with similar names (threshold: {:.0}%)",
+ self.information.number_of_duplicated_files_by_fuzzy_name,
+ self.information.number_of_groups_by_fuzzy_name,
+ self.params.name_similarity_threshold * 100.0,
+ )?;
+ for (file_entry, vector) in &self.files_with_fuzzy_names_referenced {
+ writeln!(writer, "\n---- {} files", vector.len())?;
+ writeln!(writer, "Reference file - \"{}\"", file_entry.path.to_string_lossy())?;
+ for j in vector {
+ writeln!(writer, "\"{}\"", j.path.to_string_lossy())?;
+ }
+ }
+ } else {
+ write!(writer, "Not found any files with similar names.")?;
+ }
+ }
CheckingMethod::SizeName => {
if !self.files_with_identical_names.is_empty() {
writeln!(
@@ -317,6 +367,7 @@ impl PrintResults for DuplicateFinder {
if self.get_use_reference() {
match self.get_params().check_method {
CheckingMethod::Name => self.save_results_to_file_as_json_internal(file_name, &self.files_with_identical_names_referenced, pretty_print),
+ CheckingMethod::FuzzyName => self.save_results_to_file_as_json_internal(file_name, &self.files_with_fuzzy_names_referenced, pretty_print),
CheckingMethod::SizeName => {
self.save_results_to_file_as_json_internal(file_name, &self.files_with_identical_size_names_referenced.values().collect::>(), pretty_print)
}
@@ -327,6 +378,7 @@ impl PrintResults for DuplicateFinder {
} else {
match self.get_params().check_method {
CheckingMethod::Name => self.save_results_to_file_as_json_internal(file_name, &self.files_with_identical_names, pretty_print),
+ CheckingMethod::FuzzyName => self.save_results_to_file_as_json_internal(file_name, &self.files_with_fuzzy_names, pretty_print),
CheckingMethod::SizeName => self.save_results_to_file_as_json_internal(file_name, &self.files_with_identical_size_names.values().collect::>(), pretty_print),
CheckingMethod::Size => self.save_results_to_file_as_json_internal(file_name, &self.files_with_identical_size, pretty_print),
CheckingMethod::Hash => self.save_results_to_file_as_json_internal(file_name, &self.files_with_identical_hashes, pretty_print),
@@ -358,6 +410,7 @@ impl CommonData for DuplicateFinder {
fn found_any_items(&self) -> bool {
self.get_information().number_of_duplicated_files_by_hash > 0
|| self.get_information().number_of_duplicated_files_by_name > 0
+ || self.get_information().number_of_duplicated_files_by_fuzzy_name > 0
|| self.get_information().number_of_duplicated_files_by_size > 0
|| self.get_information().number_of_duplicated_files_by_size_name > 0
}
diff --git a/czkawka_core/src/tools/exif_remover/core.rs b/czkawka_core/src/tools/exif_remover/core.rs
index 62e3cac00..bec03aefb 100644
--- a/czkawka_core/src/tools/exif_remover/core.rs
+++ b/czkawka_core/src/tools/exif_remover/core.rs
@@ -1,4 +1,4 @@
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
@@ -23,7 +23,7 @@ use crate::tools::exif_remover::{ExifEntry, ExifRemover, ExifRemoverParameters,
impl ExifRemover {
pub fn new(params: ExifRemoverParameters) -> Self {
- let mut additional_excluded_tags = BTreeMap::new();
+ let mut additional_excluded_tags = std::collections::BTreeMap::new();
let tiff_disabled_tags = vec![
"ImageWidth",
@@ -92,7 +92,7 @@ impl ExifRemover {
&mut self,
_stop_flag: &Arc,
progress_sender: Option<&Sender>,
- ) -> (BTreeMap, BTreeMap, BTreeMap) {
+ ) -> (HashMap, HashMap, HashMap) {
let progress_handler = prepare_thread_handler_common(progress_sender, CurrentStage::ExifRemoverCacheLoading, 0, self.get_test_type(), 0);
let res = load_and_split_cache_generalized_by_path(&get_exif_remover_cache_file(), mem::take(&mut self.files_to_check), self);
@@ -104,7 +104,7 @@ impl ExifRemover {
fn save_to_cache(
&mut self,
vec_file_entry: &[ExifEntry],
- loaded_hash_map: BTreeMap,
+ loaded_hash_map: HashMap,
_stop_flag: &Arc,
progress_sender: Option<&Sender>,
) {
diff --git a/czkawka_core/src/tools/exif_remover/mod.rs b/czkawka_core/src/tools/exif_remover/mod.rs
index bbd5ced41..82b0953ad 100644
--- a/czkawka_core/src/tools/exif_remover/mod.rs
+++ b/czkawka_core/src/tools/exif_remover/mod.rs
@@ -3,7 +3,7 @@ pub mod core;
mod tests;
pub mod traits;
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, HashMap};
use std::path::PathBuf;
use std::time::Duration;
@@ -66,7 +66,7 @@ pub struct ExifRemover {
common_data: CommonToolData,
information: Info,
exif_files: Vec,
- files_to_check: BTreeMap,
+ files_to_check: HashMap,
params: ExifRemoverParameters,
additional_excluded_tags: BTreeMap<&'static str, Vec<&'static str>>,
}
diff --git a/czkawka_core/src/tools/invalid_symlinks/core.rs b/czkawka_core/src/tools/invalid_symlinks/core.rs
index b1b0bea56..41adf48bd 100644
--- a/czkawka_core/src/tools/invalid_symlinks/core.rs
+++ b/czkawka_core/src/tools/invalid_symlinks/core.rs
@@ -73,8 +73,9 @@ impl InvalidSymlinks {
current_path = match current_path.read_link() {
Ok(t) => t,
Err(_inspected) => {
- // Looks that some next symlinks are broken, but we do nothing with it - TODO why they are broken
- return None;
+ // A symlink in the chain is broken (e.g. A -> B -> missing)
+ type_of_error = ErrorType::NonExistentFile;
+ break;
}
};
diff --git a/czkawka_core/src/tools/same_music/core.rs b/czkawka_core/src/tools/same_music/core.rs
index a9e63ff1a..8f306f515 100644
--- a/czkawka_core/src/tools/same_music/core.rs
+++ b/czkawka_core/src/tools/same_music/core.rs
@@ -1,4 +1,4 @@
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, HashMap};
use std::fs::File;
use std::path::Path;
use std::sync::Arc;
@@ -74,12 +74,12 @@ impl SameMusic {
}
#[fun_time(message = "load_cache", level = "debug")]
- fn load_cache(&mut self, checking_tags: bool) -> (BTreeMap, BTreeMap, BTreeMap) {
+ fn load_cache(&mut self, checking_tags: bool) -> (HashMap, HashMap, HashMap) {
load_and_split_cache_generalized_by_path(&get_similar_music_cache_file(checking_tags), mem::take(&mut self.music_to_check), self)
}
#[fun_time(message = "save_cache", level = "debug")]
- fn save_cache(&mut self, vec_file_entry: &[MusicEntry], loaded_hash_map: BTreeMap, checking_tags: bool) {
+ fn save_cache(&mut self, vec_file_entry: &[MusicEntry], loaded_hash_map: HashMap, checking_tags: bool) {
save_and_connect_cache_generalized_by_path(&get_similar_music_cache_file(checking_tags), vec_file_entry, loaded_hash_map, self);
}
@@ -312,6 +312,10 @@ impl SameMusic {
progress_handler.join_thread();
+ // Sort entries within each group by path for deterministic results
+ for group in &mut old_duplicates {
+ group.sort_unstable_by(|a, b| a.path.cmp(&b.path));
+ }
self.duplicated_music_entries = old_duplicates;
if self.common_data.use_reference_folders {
@@ -426,7 +430,16 @@ impl SameMusic {
Err(e) => return Some(Err(flc!("core_error_comparing_fingerprints", reason = e.to_string()))),
};
segments.retain(|s| s.duration(configuration) > minimum_segment_duration && s.score < maximum_difference);
- if segments.is_empty() { None } else { Some(Ok((e_string, e_entry))) }
+ if segments.is_empty() {
+ None
+ } else {
+ let best_score = segments
+ .iter()
+ .map(|s| s.score)
+ .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
+ .unwrap_or(0.0);
+ Some(Ok((e_string, e_entry, best_score)))
+ }
})
.flatten()
.partition_map(|res| match res {
@@ -436,12 +449,14 @@ impl SameMusic {
self.common_data.text_messages.errors.extend(errors);
- collected_similar_items.retain(|(path, _entry)| !used_paths.contains(path));
+ collected_similar_items.retain(|(path, _entry, _score)| !used_paths.contains(path));
if !collected_similar_items.is_empty() {
let mut music_entries = Vec::new();
- for (path, entry) in collected_similar_items {
+ for (path, entry, score) in collected_similar_items {
used_paths.insert(path);
- music_entries.push(entry.clone());
+ let mut entry = entry.clone();
+ entry.similarity_score = score;
+ music_entries.push(entry);
}
used_paths.insert(f_string);
music_entries.push(f_entry);
@@ -457,6 +472,9 @@ impl SameMusic {
return WorkContinueStatus::Continue;
}
+ // Sort for deterministic grouping regardless of HashMap/cache iteration order
+ self.music_entries.sort_unstable_by(|a, b| a.path.cmp(&b.path));
+
let grouped_files_to_check = self.split_fingerprints_to_check();
let base_files_number = grouped_files_to_check.iter().map(|g| g.base_files.len()).sum::();
@@ -474,6 +492,10 @@ impl SameMusic {
progress_handler.join_thread();
+ // Sort entries within each group by path for deterministic results
+ for group in &mut duplicated_music_entries {
+ group.sort_unstable_by(|a, b| a.path.cmp(&b.path));
+ }
self.duplicated_music_entries = duplicated_music_entries;
if self.common_data.use_reference_folders {
diff --git a/czkawka_core/src/tools/same_music/mod.rs b/czkawka_core/src/tools/same_music/mod.rs
index b6f4c9898..04813dc3f 100644
--- a/czkawka_core/src/tools/same_music/mod.rs
+++ b/czkawka_core/src/tools/same_music/mod.rs
@@ -5,7 +5,7 @@ pub mod traits;
#[cfg(test)]
mod tests;
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::time::Duration;
@@ -44,6 +44,9 @@ pub struct MusicEntry {
pub length: u32,
pub genre: String,
pub bitrate: u32,
+ /// Best fingerprint match score (lower = more similar). 0 for tag-based matches.
+ #[serde(default)]
+ pub similarity_score: f64,
}
impl ResultEntry for MusicEntry {
@@ -72,6 +75,7 @@ impl FileEntry {
length: 0,
genre: String::new(),
bitrate: 0,
+ similarity_score: 0.0,
}
}
}
@@ -123,7 +127,7 @@ impl SameMusicParameters {
pub struct SameMusic {
common_data: CommonToolData,
information: Info,
- music_to_check: BTreeMap,
+ music_to_check: HashMap,
music_entries: Vec,
duplicated_music_entries: Vec>,
duplicated_music_entries_referenced: Vec<(MusicEntry, Vec)>,
diff --git a/czkawka_core/src/tools/similar_images/core.rs b/czkawka_core/src/tools/similar_images/core.rs
index 071b9c80b..ae30a3bb9 100644
--- a/czkawka_core/src/tools/similar_images/core.rs
+++ b/czkawka_core/src/tools/similar_images/core.rs
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, BTreeSet};
+use std::collections::{BTreeSet, HashMap};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
@@ -54,7 +54,7 @@ impl SimilarImages {
.into_par_iter()
.flat_map(if self.get_hide_hard_links() { |(_, fes)| fes } else { take_1_per_inode })
.map(|fe| {
- let fe_str = fe.path.to_string_lossy().to_string();
+ let fe_str = fe.path.to_string_lossy().into_owned();
let image_entry = fe.into_images_entry();
(fe_str, image_entry)
@@ -73,7 +73,7 @@ impl SimilarImages {
}
#[fun_time(message = "hash_images_load_cache", level = "debug")]
- fn hash_images_load_cache(&mut self) -> (BTreeMap, BTreeMap, BTreeMap) {
+ fn hash_images_load_cache(&mut self) -> (HashMap, HashMap, HashMap) {
load_and_split_cache_generalized_by_path(
&get_similar_images_cache_file(self.get_params().hash_size, self.get_params().hash_alg, self.get_params().image_filter),
mem::take(&mut self.images_to_check),
@@ -82,7 +82,7 @@ impl SimilarImages {
}
#[fun_time(message = "save_to_cache", level = "debug")]
- fn save_to_cache(&mut self, vec_file_entry: &[ImagesEntry], loaded_hash_map: BTreeMap) {
+ fn save_to_cache(&mut self, vec_file_entry: &[ImagesEntry], loaded_hash_map: HashMap) {
save_and_connect_cache_generalized_by_path(
&get_similar_images_cache_file(self.get_params().hash_size, self.get_params().hash_alg, self.get_params().image_filter),
vec_file_entry,
@@ -185,24 +185,35 @@ impl SimilarImages {
.collect();
let mut base_hashes = Vec::new(); // Initial hashes
if self.common_data.use_reference_folders {
- let mut files_from_referenced_folders: IndexMap> = IndexMap::new();
- let mut normal_files: IndexMap> = IndexMap::new();
+ let mut hashes_referenced: IndexSet = IndexSet::new();
+ let mut hashes_normal: IndexSet = IndexSet::new();
- all_hashed_images.clone().into_iter().for_each(|(hash, vec_file_entry)| {
+ for (hash, vec_file_entry) in all_hashed_images {
+ let mut has_referenced = false;
+ let mut has_normal = false;
for file_entry in vec_file_entry {
- if is_in_reference_folder(&self.common_data.directories.reference_directories, &file_entry.path) {
- files_from_referenced_folders.entry(hash.clone()).or_default().push(file_entry);
+ if is_in_reference_folder(&self.common_data.directories.reference_directories, &self.common_data.directories.reference_files, &file_entry.path) {
+ has_referenced = true;
} else {
- normal_files.entry(hash.clone()).or_default().push(file_entry);
+ has_normal = true;
+ }
+ if has_referenced && has_normal {
+ break;
}
}
- });
+ if has_referenced {
+ hashes_referenced.insert(hash.clone());
+ }
+ if has_normal {
+ hashes_normal.insert(hash.clone());
+ }
+ }
- for hash in normal_files.into_keys() {
+ for hash in hashes_normal {
self.bktree.add(hash);
}
- for hash in files_from_referenced_folders.into_keys() {
+ for hash in hashes_referenced {
base_hashes.push(hash);
}
} else {
@@ -219,35 +230,37 @@ impl SimilarImages {
&self,
hashes_parents: IndexMap,
hashes_with_multiple_images: &IndexSet,
- all_hashed_images: &IndexMap>,
+ all_hashed_images: &mut IndexMap>,
collected_similar_images: &mut IndexMap>,
hashes_similarity: IndexMap,
) {
- // Collecting results to vector
+ // Collecting results to vector - use swap_remove to move data instead of cloning
for (parent_hash, child_number) in hashes_parents {
// If hash contains other hasher OR multiple images are available for checked hash
if child_number > 0 || hashes_with_multiple_images.contains(&parent_hash) {
- let vec_fe = all_hashed_images[&parent_hash].clone();
- collected_similar_images.insert(parent_hash.clone(), vec_fe);
+ if let Some(vec_fe) = all_hashed_images.swap_remove(&parent_hash) {
+ collected_similar_images.insert(parent_hash, vec_fe);
+ }
}
}
for (child_hash, (parent_hash, similarity)) in hashes_similarity {
- let mut vec_fe = all_hashed_images[&child_hash].clone();
- for fe in &mut vec_fe {
- fe.difference = similarity;
+ if let Some(mut vec_fe) = all_hashed_images.swap_remove(&child_hash) {
+ for fe in &mut vec_fe {
+ fe.difference = similarity;
+ }
+ collected_similar_images
+ .get_mut(&parent_hash)
+ .expect("Cannot find parent hash - this should be added in previous step")
+ .append(&mut vec_fe);
}
- collected_similar_images
- .get_mut(&parent_hash)
- .expect("Cannot find parent hash - this should be added in previous step")
- .append(&mut vec_fe);
}
}
#[fun_time(message = "compare_hashes_with_non_zero_tolerance", level = "debug")]
fn compare_hashes_with_non_zero_tolerance(
&mut self,
- all_hashed_images: &IndexMap>,
+ all_hashed_images: &mut IndexMap>,
collected_similar_images: &mut IndexMap>,
progress_sender: Option<&Sender>,
stop_flag: &Arc,
@@ -265,7 +278,11 @@ impl SimilarImages {
// Without chunks, every single hash would be compared to every other hash and generate really big amount of results
// With chunks we can save results to variables and later use such variables, to skip ones with too big difference
// Not really helpful, when not finding almost any duplicates, but with bigger amount of them, this should help a lot
- let base_hashes_chunks = base_hashes.chunks(1000);
+ let chunk_size = {
+ let num_cores = std::thread::available_parallelism().map_or(4, |p| p.get());
+ (base_hashes.len() / (num_cores * 4)).clamp(1000, 10000)
+ };
+ let base_hashes_chunks = base_hashes.chunks(chunk_size);
for chunk in base_hashes_chunks {
let partial_results = chunk
.into_par_iter()
@@ -282,24 +299,24 @@ impl SimilarImages {
*similarity != 0 && !hashes_parents.contains_key(*compared_hash) && !hashes_with_multiple_images.contains(*compared_hash)
})
.filter(|(similarity, compared_hash)| {
- if let Some((_, other_similarity_with_parent)) = hashes_similarity.get(*compared_hash) {
- // If current hash is more similar to other hash than to current parent hash, then skip check earlier
- // Because there is no way to be more similar to other hash than to current parent hash
- if *similarity >= *other_similarity_with_parent {
- return false;
- }
+ if let Some((_, other_similarity_with_parent)) = hashes_similarity.get(*compared_hash)
+ && *similarity >= *other_similarity_with_parent
+ {
+ return false;
}
true
})
.collect::>();
- // Sort by tolerance
+ if found_items.is_empty() && !hashes_with_multiple_images.contains(hash_to_check) {
+ return Some(None);
+ }
+
found_items.sort_unstable_by_key(|f| f.0);
- Some((hash_to_check, found_items))
+ Some(Some((hash_to_check, found_items)))
})
.while_some()
- // TODO - this filter move to into_par_iter above
- .filter(|(original_hash, vec_similar_hashes)| !vec_similar_hashes.is_empty() || hashes_with_multiple_images.contains(*original_hash))
+ .flatten()
.collect::>();
if check_if_stop_received(stop_flag) {
@@ -417,7 +434,7 @@ impl SimilarImages {
// Results
let mut collected_similar_images: IndexMap> = Default::default();
- let all_hashed_images = mem::take(&mut self.image_hashes);
+ let mut all_hashed_images = mem::take(&mut self.image_hashes);
// Checking entries with tolerance 0 is really easy and fast, because only entries with same hashes needs to be checked
if tolerance == 0 {
@@ -426,7 +443,7 @@ impl SimilarImages {
collected_similar_images.insert(hash, vec_file_entry);
}
}
- } else if self.compare_hashes_with_non_zero_tolerance(&all_hashed_images, &mut collected_similar_images, progress_sender, stop_flag, tolerance) == WorkContinueStatus::Stop
+ } else if self.compare_hashes_with_non_zero_tolerance(&mut all_hashed_images, &mut collected_similar_images, progress_sender, stop_flag, tolerance) == WorkContinueStatus::Stop
{
return WorkContinueStatus::Stop;
}
@@ -519,7 +536,7 @@ impl SimilarImages {
continue;
}
for file_entry in vec_file_entry {
- let st = file_entry.path.to_string_lossy().to_string();
+ let st = file_entry.path.to_string_lossy().into_owned();
if result_hashset.contains(&st) {
found = true;
error!("Duplicated Element {st}");
@@ -532,8 +549,8 @@ impl SimilarImages {
}
}
-fn is_in_reference_folder(reference_directories: &[PathBuf], path: &Path) -> bool {
- reference_directories.iter().any(|e| path.starts_with(e))
+fn is_in_reference_folder(reference_directories: &[PathBuf], reference_files: &[PathBuf], path: &Path) -> bool {
+ reference_directories.iter().any(|e| path.starts_with(e)) || reference_files.iter().any(|e| e.as_path() == path)
}
#[expect(clippy::indexing_slicing)] // Because hash size is validated before
@@ -644,7 +661,7 @@ fn debug_check_for_duplicated_things(
hashmap_hashes.insert((*hash).clone());
for i in &all_hashed_images[hash] {
- let name = i.path.to_string_lossy().to_string();
+ let name = i.path.to_string_lossy().into_owned();
if hashmap_names.contains(&name) {
debug!("------1--NAME--{numm} {name:?}");
found_broken_thing = true;
@@ -661,7 +678,7 @@ fn debug_check_for_duplicated_things(
hashmap_hashes.insert((*hash).clone());
for i in &all_hashed_images[hash] {
- let name = i.path.to_string_lossy().to_string();
+ let name = i.path.to_string_lossy().into_owned();
if hashmap_names.contains(&name) {
debug!("------2--NAME--{numm} {name:?}");
found_broken_thing = true;
@@ -951,7 +968,7 @@ mod tests {
similar_images.find_similar_hashes(&Arc::default(), None);
let res = similar_images.get_similar_images();
assert_eq!(res.len(), 1);
- let mut path = res[0].iter().map(|e| e.path.to_string_lossy().to_string()).collect::>();
+ let mut path = res[0].iter().map(|e| e.path.to_string_lossy().into_owned()).collect::>();
path.sort();
if res[0].len() == 3 {
assert_eq!(path, vec!["abc.txt".to_string(), "bcd.txt".to_string(), "rrd.txt".to_string()]);
diff --git a/czkawka_core/src/tools/similar_images/mod.rs b/czkawka_core/src/tools/similar_images/mod.rs
index 8f3318cb2..0189f8737 100644
--- a/czkawka_core/src/tools/similar_images/mod.rs
+++ b/czkawka_core/src/tools/similar_images/mod.rs
@@ -6,7 +6,7 @@ pub use core::return_similarity_from_similarity_preset;
#[cfg(test)]
mod tests;
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::time::Duration;
@@ -121,7 +121,7 @@ pub struct SimilarImages {
similar_referenced_vectors: Vec<(ImagesEntry, Vec)>,
// Hashmap with image hashes and Vector with names of files
image_hashes: IndexMap>,
- images_to_check: BTreeMap,
+ images_to_check: HashMap,
params: SimilarImagesParameters,
}
diff --git a/czkawka_core/src/tools/similar_videos/core.rs b/czkawka_core/src/tools/similar_videos/core.rs
index 6249752a3..16c114035 100644
--- a/czkawka_core/src/tools/similar_videos/core.rs
+++ b/czkawka_core/src/tools/similar_videos/core.rs
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, BTreeSet};
+use std::collections::{BTreeSet, HashMap};
use std::mem;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
@@ -258,7 +258,7 @@ impl SimilarVideos {
}
#[fun_time(message = "save_cache", level = "debug")]
- fn save_cache(&mut self, vec_file_entry: &[VideosEntry], loaded_hash_map: BTreeMap) {
+ fn save_cache(&mut self, vec_file_entry: &[VideosEntry], loaded_hash_map: HashMap) {
save_and_connect_cache_generalized_by_path(
&get_similar_videos_cache_file(self.params.skip_forward_amount, self.params.duration, self.params.crop_detect),
vec_file_entry,
@@ -268,7 +268,7 @@ impl SimilarVideos {
}
#[fun_time(message = "load_cache_at_start", level = "debug")]
- fn load_cache_at_start(&mut self) -> (BTreeMap, BTreeMap, BTreeMap) {
+ fn load_cache_at_start(&mut self) -> (HashMap, HashMap, HashMap) {
load_and_split_cache_generalized_by_path(
&get_similar_videos_cache_file(self.params.skip_forward_amount, self.params.duration, self.params.crop_detect),
mem::take(&mut self.videos_to_check),
diff --git a/czkawka_core/src/tools/similar_videos/mod.rs b/czkawka_core/src/tools/similar_videos/mod.rs
index b47b33bc8..68444043d 100644
--- a/czkawka_core/src/tools/similar_videos/mod.rs
+++ b/czkawka_core/src/tools/similar_videos/mod.rs
@@ -4,7 +4,7 @@ pub mod traits;
#[cfg(test)]
mod tests;
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use std::ops::RangeInclusive;
use std::path::{Path, PathBuf};
use std::time::Duration;
@@ -136,8 +136,8 @@ pub struct SimilarVideos {
information: Info,
similar_vectors: Vec>,
similar_referenced_vectors: Vec<(VideosEntry, Vec)>,
- videos_hashes: BTreeMap, Vec>,
- videos_to_check: BTreeMap,
+ videos_hashes: HashMap, Vec>,
+ videos_to_check: HashMap,
params: SimilarVideosParameters,
}
diff --git a/czkawka_core/src/tools/video_optimizer/core.rs b/czkawka_core/src/tools/video_optimizer/core.rs
index b8d1653bc..fe4d24550 100644
--- a/czkawka_core/src/tools/video_optimizer/core.rs
+++ b/czkawka_core/src/tools/video_optimizer/core.rs
@@ -1,4 +1,4 @@
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use std::mem;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
@@ -349,25 +349,25 @@ impl VideoOptimizer {
fn load_video_transcode_cache(
&mut self,
) -> (
- BTreeMap,
- BTreeMap,
- BTreeMap,
+ HashMap,
+ HashMap,
+ HashMap,
) {
load_and_split_cache_generalized_by_path(&get_video_transcode_cache_file(), mem::take(&mut self.video_transcode_test_entries), self)
}
#[fun_time(message = "load_video_crop_cache", level = "debug")]
- fn load_video_crop_cache(&mut self, params: &VideoCropParams) -> (BTreeMap, BTreeMap, BTreeMap) {
+ fn load_video_crop_cache(&mut self, params: &VideoCropParams) -> (HashMap, HashMap, HashMap) {
load_and_split_cache_generalized_by_path(&get_video_crop_cache_file(params), mem::take(&mut self.video_crop_test_entries), self)
}
#[fun_time(message = "save_video_transcode_cache", level = "debug")]
- fn save_video_transcode_cache(&mut self, vec_file_entry: &[VideoTranscodeEntry], loaded_hash_map: BTreeMap) {
+ fn save_video_transcode_cache(&mut self, vec_file_entry: &[VideoTranscodeEntry], loaded_hash_map: HashMap) {
save_and_connect_cache_generalized_by_path(&get_video_transcode_cache_file(), vec_file_entry, loaded_hash_map, self);
}
#[fun_time(message = "save_video_crop_cache", level = "debug")]
- fn save_video_crop_cache(&mut self, vec_file_entry: &[VideoCropEntry], params: &VideoCropParams, loaded_hash_map: BTreeMap) {
+ fn save_video_crop_cache(&mut self, vec_file_entry: &[VideoCropEntry], params: &VideoCropParams, loaded_hash_map: HashMap) {
save_and_connect_cache_generalized_by_path(&get_video_crop_cache_file(params), vec_file_entry, loaded_hash_map, self);
}
diff --git a/czkawka_core/src/tools/video_optimizer/mod.rs b/czkawka_core/src/tools/video_optimizer/mod.rs
index ace9ddb8e..43e3b92f0 100644
--- a/czkawka_core/src/tools/video_optimizer/mod.rs
+++ b/czkawka_core/src/tools/video_optimizer/mod.rs
@@ -3,7 +3,7 @@ pub mod core;
mod tests;
pub mod traits;
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::time::Duration;
@@ -317,8 +317,8 @@ pub enum VideoOptimizerEntry {
pub struct VideoOptimizer {
common_data: CommonToolData,
information: Info,
- video_transcode_test_entries: BTreeMap,
- video_crop_test_entries: BTreeMap,
+ video_transcode_test_entries: HashMap,
+ video_crop_test_entries: HashMap,
video_transcode_result_entries: Vec,
video_crop_result_entries: Vec,
params: VideoOptimizerParameters,
diff --git a/czkawka_gui/src/connect_things/connect_settings.rs b/czkawka_gui/src/connect_things/connect_settings.rs
index 3a7e09f3c..aaa61cdf5 100644
--- a/czkawka_gui/src/connect_things/connect_settings.rs
+++ b/czkawka_gui/src/connect_things/connect_settings.rs
@@ -1,4 +1,4 @@
-use std::collections::BTreeMap;
+use std::collections::HashMap;
use std::default::Default;
use czkawka_core::common::cache::{load_cache_from_file_generalized_by_path, load_cache_from_file_generalized_by_size, save_cache_to_file_generalized};
@@ -131,7 +131,7 @@ pub(crate) fn connect_settings(gui_data: &GuiData) {
let (mut messages, loaded_items) = load_cache_from_file_generalized_by_size::(&file_name, true, &Default::default());
if let Some(cache_entries) = loaded_items {
- let mut hashmap_to_save: BTreeMap = Default::default();
+ let mut hashmap_to_save: HashMap = Default::default();
for (_, vec_file_entry) in cache_entries {
for file_entry in vec_file_entry {
hashmap_to_save.insert(file_entry.path.to_string_lossy().to_string(), file_entry);
diff --git a/czkawka_mcp/Cargo.toml b/czkawka_mcp/Cargo.toml
new file mode 100644
index 000000000..2788e5295
--- /dev/null
+++ b/czkawka_mcp/Cargo.toml
@@ -0,0 +1,29 @@
+[package]
+name = "czkawka_mcp"
+version = "11.0.1"
+authors = ["Rafał Mikrut "]
+edition = "2024"
+rust-version = "1.92.0"
+description = "MCP (Model Context Protocol) server for Czkawka - exposes file analysis tools to AI agents"
+license = "MIT"
+homepage = "https://github.com/qarmin/czkawka"
+repository = "https://github.com/qarmin/czkawka"
+
+[dependencies]
+czkawka_core = { path = "../czkawka_core", version = "11.0.1" }
+rmcp = { version = "0.1", features = ["server", "transport-io"] }
+tokio = { version = "1", features = ["macros", "rt-multi-thread", "io-std"] }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+schemars = "0.8"
+crossbeam-channel = "0.5"
+log = "0.4"
+
+[features]
+default = []
+heif = ["czkawka_core/heif"]
+libraw = ["czkawka_core/libraw"]
+libavif = ["czkawka_core/libavif"]
+
+[lints]
+workspace = true
diff --git a/czkawka_mcp/src/main.rs b/czkawka_mcp/src/main.rs
new file mode 100644
index 000000000..30078a1fa
--- /dev/null
+++ b/czkawka_mcp/src/main.rs
@@ -0,0 +1,633 @@
+// rmcp #[tool] macro requires &self and owned params; these clippy warnings are false positives
+#![allow(clippy::needless_pass_by_value, clippy::unused_self)]
+
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::sync::atomic::AtomicBool;
+
+use crossbeam_channel::unbounded;
+use czkawka_core::common::image::register_image_decoding_hooks;
+use czkawka_core::common::set_number_of_threads;
+use czkawka_core::common::tool_data::CommonData as _;
+use czkawka_core::common::traits::{AllTraits, PrintResults};
+use czkawka_core::tools::bad_extensions::{BadExtensions, BadExtensionsParameters};
+use czkawka_core::tools::bad_names::{BadNames, BadNamesParameters, NameIssues};
+use czkawka_core::tools::big_file::{BigFile, BigFileParameters, SearchMode};
+use czkawka_core::tools::broken_files::{BrokenFiles, BrokenFilesParameters, CheckedTypes};
+use czkawka_core::tools::duplicate::{DuplicateFinder, DuplicateFinderParameters};
+use czkawka_core::tools::empty_files::EmptyFiles;
+use czkawka_core::tools::empty_folder::EmptyFolder;
+use czkawka_core::tools::exif_remover::{ExifRemover, ExifRemoverParameters};
+use czkawka_core::tools::invalid_symlinks::InvalidSymlinks;
+use czkawka_core::tools::same_music::{SameMusic, SameMusicParameters};
+use czkawka_core::tools::similar_images::{SimilarImages, SimilarImagesParameters};
+use czkawka_core::tools::similar_videos::{SimilarVideos, SimilarVideosParameters};
+use czkawka_core::tools::temporary::Temporary;
+use czkawka_core::tools::video_optimizer::{
+ VideoCropParams, VideoCroppingMechanism, VideoOptimizer, VideoOptimizerParameters, VideoTranscodeParams,
+};
+use rmcp::model::{CallToolResult, Content, ServerCapabilities, ServerInfo};
+use rmcp::{ServerHandler, ServiceExt, tool};
+use schemars::JsonSchema;
+use serde::Deserialize;
+
+// ── Common parameter structs ──────────────────────────────────────────
+
+#[derive(Debug, Deserialize, JsonSchema)]
+struct CommonParams {
+ #[schemars(description = "List of directories to search (required)")]
+ directories: Vec,
+ #[schemars(description = "Directories to exclude from search")]
+ excluded_directories: Option>,
+ #[schemars(description = "Wildcard patterns to exclude (e.g. '*/.git', '*.tmp')")]
+ excluded_items: Option>,
+ #[schemars(description = "Only check files with these extensions (e.g. ['jpg', 'png'])")]
+ allowed_extensions: Option>,
+ #[schemars(description = "Skip files with these extensions")]
+ excluded_extensions: Option>,
+ #[schemars(description = "If true, do not recurse into subdirectories (default: false)")]
+ not_recursive: Option,
+ #[schemars(description = "Number of threads to use (0 = all available, default: 0)")]
+ thread_number: Option,
+ #[schemars(description = "Disable the cache system (default: false)")]
+ disable_cache: Option,
+}
+
+fn apply_common