Skip to content

Commit 4ab504c

Browse files
authored
Merge pull request #840 from Tanddant/Find-DuplicateFiles
🔨 - Added Samples - Find duplicate SPO files
2 parents 79ad001 + 4475471 commit 4ab504c

File tree

4 files changed

+136
-0
lines changed

4 files changed

+136
-0
lines changed
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# SharePoint Online - Export Duplicate Files
2+
3+
## Summary
4+
5+
This is a simple PowerShell script that loops through all the files in a SharePoint Online tenant and compares their file hashes in order to identify duplicate files. The script does not delete anything; it outputs an overview of the duplicates for you to review.
6+
7+
**NOTE** the proper way to do this would be using [Microsoft Graph Data Connect](https://learn.microsoft.com/en-us/graph/data-connect-concept-overview), but I'm cheap, and only needed it on my development tenant, so I wrote this script instead, but the concept remains the same.
8+
9+
**NOTE** I ran this using delegated permissions, which only return files that the signed-in user has access to. If you want the full overview of all files, run it using application permissions instead.
10+
11+
# [PnP PowerShell](#tab/pnpps)
12+
13+
```powershell
14+
# Finds duplicate files across the SharePoint Online sites returned by Microsoft Graph
# by comparing the quickXorHash reported for each file.
# Nothing is deleted: duplicates are only listed in the console, grouped by hash.

$ErrorActionPreference = "Stop"

# Use HTTPS for the tenant root URL.
$SharePointRootSiteUrl = "https://<tenant>.sharepoint.com/"

Connect-PnPOnline -Interactive -Url $SharePointRootSiteUrl -ClientId "<ClientId>"

# Generic list: Add() returns nothing, so there is no output to suppress.
$allFiles = [System.Collections.Generic.List[object]]::new()

# Enumerate sites through the Graph search endpoint; -All follows the paging links.
$sites = Invoke-PnPGraphMethod -Url "https://graph.microsoft.com/v1.0/sites/?`$search=`"http*`"&`$select=id,webUrl,displayName&`$top=100" -All

foreach ($site in $sites.value) {
    Write-Host "> Site: $($site.displayName) - ($($site.webUrl))"
    $drives = Invoke-PnPGraphMethod -Url "https://graph.microsoft.com/v1.0/sites/$($site.id)/drives?`$select=id,webUrl,name&`$top=100" -All

    foreach ($drive in $drives.value) {
        Write-Host "`t> Drive: $($drive.name) - ($($drive.webUrl))"

        ## Would've loved to use a $select=file,id,webUrl,size,name but that breaks for some reason when using PnP PowerShell
        $files = Invoke-PnPGraphMethod -Url "https://graph.microsoft.com/v1.0/sites/$($site.id)/drives/$($drive.id)/items?`$filter=file ne null" -All

        # Keep only items that carry a 'file' facet. $null goes on the left so the
        # comparison stays a plain null check even if the property holds a collection.
        foreach ($file in $files.value | Where-Object { $null -ne $_.file }) {
            Write-Host "`t`t>File: $($file.name)"

            $allFiles.Add([PSCustomObject]@{
                    SiteId     = $site.id
                    DriveId    = $drive.id
                    FileId     = $file.id
                    FileName   = $file.name
                    FileWebUrl = $file.webUrl
                    FileSize   = $file.size
                    FileHash   = $file.file.hashes.quickXorHash
                })
        }
        Write-Host "`t> Finished processing files in drive: $($drive.name)"
    }
    Write-Host "> Finished processing drives in site: $($site.displayName)"
}

Write-Host "Finished loading all files"

# Ignore files without a hash, then keep only hashes shared by more than one file,
# largest groups first.
$grouped = $allFiles |
    Where-Object { $null -ne $_.FileHash } |
    Group-Object -Property FileHash |
    Where-Object { $_.Count -gt 1 } |
    Sort-Object -Property Count -Descending

foreach ($group in $grouped) {
    Write-Host "Duplicate files with hash: $($group.Name)"
    foreach ($file in $group.Group) {
        Write-Host "`t> $($file.FileName) - $($file.FileWebUrl)"
    }
    Write-Host ""
}
64+
65+
66+
```
67+
68+
***
69+
70+
## Contributors
71+
72+
| Author(s) |
73+
| ------------------------------- |
74+
| [Dan Toft](https://Dan-toft.dk) |
75+
76+
77+
[!INCLUDE [DISCLAIMER](../../docfx/includes/DISCLAIMER.md)]
78+
<img src="https://m365-visitor-stats.azurewebsites.net/script-samples/scripts/spo-export-duplicate-files" aria-hidden="true" />
Loading
Loading
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
[
2+
{
3+
"name": "spo-export-duplicate-files",
4+
"source": "pnp",
5+
"title": "Export duplicate files from SharePoint Online",
6+
"shortDescription": "Export duplicate files from SharePoint Online document libraries.",
7+
"url": "https://pnp.github.io/script-samples/spo-export-duplicate-files/README.html",
8+
"longDescription": [
9+
""
10+
],
11+
"creationDateTime": "2025-05-25",
12+
"updateDateTime": "2025-05-25",
13+
"products": [
14+
"SharePoint",
15+
"Graph"
16+
],
17+
"metadata": [
18+
{
19+
"key": "PNP-POWERSHELL",
20+
"value": "3.1.0"
21+
},
22+
{
23+
"key": "POWERSHELL",
24+
"value": "7.4.6"
25+
}
26+
],
27+
"categories": [
28+
"Data",
29+
"Report"
30+
],
31+
"tags": [
32+
"Connect-PnPOnline", "Invoke-PnPGraphMethod"
33+
],
34+
"thumbnails": [
35+
{
36+
"type": "image",
37+
"order": 100,
38+
"url": "https://raw.githubusercontent.com/pnp/script-samples/main/scripts/spo-export-duplicate-files/assets/preview.png",
39+
"alt": "Preview of the sample Export duplicate files from SharePoint Online"
40+
}
41+
],
42+
"authors": [
43+
{
44+
"gitHubAccount": "Tanddant",
45+
"company": "",
46+
"pictureUrl": "https://github.com/Tanddant.png",
47+
"name": "Dan Toft"
48+
}
49+
],
50+
"references": [
51+
{
52+
"name": "Want to learn more about PnP PowerShell and the cmdlets",
53+
"description": "Check out the PnP PowerShell site to get started and for the reference to the cmdlets.",
54+
"url": "https://aka.ms/pnp/powershell"
55+
}
56+
]
57+
}
58+
]

0 commit comments

Comments
 (0)