Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/hardlink/aware.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,9 @@ where
}));

let ino = InodeNumber::get(stats);
let dev = stats.dev();
self.record
.add(ino, size, links, path)
.add(ino, dev, size, links, path)
.map_err(ReportHardlinksError::AddToRecord)
}
}
Expand Down
22 changes: 19 additions & 3 deletions src/hardlink/hardlink_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,20 @@ use pipe_trait::Pipe;
#[cfg(any(unix, test))]
use std::path::Path;

/// Internal key used to uniquely identify an inode across all filesystems.
///
/// Hardlinks cannot span filesystems, so including the device number prevents
/// false deduplication of files from different filesystems that happen to share
/// the same inode number. Both du-dust and dua-cli track `(device, inode)` pairs
/// for the same reason.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct InodeKey {
/// Device number of the filesystem the inode belongs to.
dev: u64,
/// Inode number within the device.
ino: InodeNumber,
}

/// Map value in [`HardlinkList`].
#[derive(Debug, Clone)]
struct Value<Size> {
Expand All @@ -38,8 +52,8 @@ struct Value<Size> {
/// [`Reflection`] which implement these traits.
#[derive(Debug, SmartDefault, Clone)]
pub struct HardlinkList<Size>(
/// Map an inode number to its size, number of links, and detected paths.
DashMap<InodeNumber, Value<Size>>,
/// Map an inode key (device + inode number) to its size, number of links, and detected paths.
DashMap<InodeKey, Value<Size>>,
);

impl<Size> HardlinkList<Size> {
Expand Down Expand Up @@ -112,13 +126,15 @@ where
pub(crate) fn add(
&self,
ino: InodeNumber,
dev: u64,
size: Size,
links: u64,
path: &Path,
) -> Result<(), AddError<Size>> {
let key = InodeKey { dev, ino };
let mut assertions = Ok(());
self.0
.entry(ino)
.entry(key)
.and_modify(|recorded| {
if size != recorded.size {
assertions = Err(AddError::SizeConflict(SizeConflictError {
Expand Down
8 changes: 4 additions & 4 deletions src/hardlink/hardlink_list/iter.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::{HardlinkList, Value};
use super::{HardlinkList, InodeKey, Value};
use crate::{hardlink::LinkPathList, inode::InodeNumber};
use dashmap::{iter::Iter as DashIter, mapref::multiple::RefMulti};
use pipe_trait::Pipe;
Expand All @@ -7,7 +7,7 @@ use pipe_trait::Pipe;
#[derive(derive_more::Debug)]
#[debug(bound())]
#[debug("Iter(..)")]
pub struct Iter<'a, Size>(DashIter<'a, InodeNumber, Value<Size>>);
pub struct Iter<'a, Size>(DashIter<'a, InodeKey, Value<Size>>);

impl<Size> HardlinkList<Size> {
/// Iterate over the recorded entries.
Expand All @@ -20,7 +20,7 @@ impl<Size> HardlinkList<Size> {
#[derive(derive_more::Debug)]
#[debug(bound())]
#[debug("Item(..)")]
pub struct Item<'a, Size>(RefMulti<'a, InodeNumber, Value<Size>>);
pub struct Item<'a, Size>(RefMulti<'a, InodeKey, Value<Size>>);

impl<'a, Size> Iterator for Iter<'a, Size> {
type Item = Item<'a, Size>;
Expand All @@ -33,7 +33,7 @@ impl<'a, Size> Item<'a, Size> {
/// The inode number of the file.
#[inline]
pub fn ino(&self) -> InodeNumber {
*self.0.key()
self.0.key().ino
}

/// Size of the file.
Expand Down
35 changes: 28 additions & 7 deletions src/hardlink/hardlink_list/reflection.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::{HardlinkList, Value};
use super::{HardlinkList, InodeKey, Value};
use crate::{hardlink::LinkPathListReflection, inode::InodeNumber};
use dashmap::DashMap;
use derive_more::{Display, Error, Into, IntoIterator};
Expand All @@ -12,8 +12,14 @@ use serde::{Deserialize, Serialize};
/// internal content.
///
/// **Guarantees:**
/// * Every inode number is unique.
/// * The internal list is always sorted by inode numbers.
/// * Every `(device, inode)` pair is unique within the scope of a single scan, but inode
/// numbers alone are **not** guaranteed to be unique: when scanning multiple filesystems,
/// two unrelated files on different devices can share the same inode number and will each
/// produce a separate entry. The reflection stores only the inode number (the JSON format
/// does not carry device information), so round-tripping a multi-filesystem scan through
/// JSON is an unsupported edge case.
/// * The internal list is always sorted by inode numbers (and by device number as a
/// tie-breaker when two entries share the same inode number).
///
/// **Equality:** `Reflection` implements `PartialEq` and `Eq` traits.
///
Expand Down Expand Up @@ -95,10 +101,17 @@ impl<Size> From<Vec<ReflectionEntry<Size>>> for Reflection<Size> {

impl<Size> From<HardlinkList<Size>> for Reflection<Size> {
fn from(HardlinkList(list): HardlinkList<Size>) -> Self {
list.into_iter()
.map(|(ino, value)| ReflectionEntry::new(ino, value))
// Collect to a vec, sort by (ino, dev) for a stable, deterministic order, then
// strip dev before wrapping. Sorting here (with dev still available) avoids the
// nondeterminism that would arise from an unstable sort on ino alone when two
// entries from different filesystems share the same inode number.
let mut pairs: Vec<(InodeKey, Value<Size>)> = list.into_iter().collect();
pairs.sort_unstable_by_key(|(key, _)| (u64::from(key.ino), key.dev));
pairs
.into_iter()
.map(|(key, value)| ReflectionEntry::new(key.ino, value))
.collect::<Vec<_>>()
.pipe(Reflection::from)
.pipe(Reflection)
}
}

Expand All @@ -119,7 +132,15 @@ impl<Size> TryFrom<Reflection<Size>> for HardlinkList<Size> {

for entry in entries {
let (ino, value) = entry.dissolve();
if map.insert(ino, value).is_some() {
// Device number is unknown when loading from a reflection (e.g. JSON input);
// use dev=0 as a placeholder. This means that when reloading JSON output that
// was produced by scanning multiple filesystems, files from different devices
// sharing the same inode number cannot be distinguished and therefore cannot
// all be represented. Such duplicates cause a ConversionError::DuplicatedInode
// and are treated as an unsupported edge case, since the JSON format does not
// carry device information.
let key = InodeKey { dev: 0, ino };
if map.insert(key, value).is_some() {
return ino.pipe(ConversionError::DuplicatedInode).pipe(Err);
}
}
Expand Down
66 changes: 51 additions & 15 deletions src/hardlink/hardlink_list/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,22 @@ use crate::size::Bytes;
use pipe_trait::Pipe;
use pretty_assertions::{assert_eq, assert_ne};

const TABLE: &[(u64, u64, u64, &str)] = &[
(241, 3652, 1, "a"),
(569, 2210, 1, "b"),
(110, 2350, 3, "c"),
(110, 2350, 3, "c1"),
(778, 1110, 1, "d"),
(274, 6060, 2, "e"),
(274, 6060, 2, "e1"),
(883, 4530, 1, "f"),
const TABLE: &[(u64, u64, u64, u64, &str)] = &[
// dev, ino, size, links, path
(0, 241, 3652, 1, "a"),
(0, 569, 2210, 1, "b"),
(0, 110, 2350, 3, "c"),
(0, 110, 2350, 3, "c1"),
(0, 778, 1110, 1, "d"),
(0, 274, 6060, 2, "e"),
(0, 274, 6060, 2, "e1"),
(0, 883, 4530, 1, "f"),
];

fn add<const ROW: usize>(list: HardlinkList<Bytes>) -> HardlinkList<Bytes> {
let values = TABLE[ROW];
let (ino, size, links, path) = values;
if let Err(error) = list.add(ino.into(), size.into(), links, path.as_ref()) {
let (dev, ino, size, links, path) = values;
if let Err(error) = list.add(ino.into(), dev, size.into(), links, path.as_ref()) {
panic!("Failed to add {values:?} (index: {ROW}) to the list: {error}");
}
list
Expand Down Expand Up @@ -119,10 +120,10 @@ fn insertion_difference_cause_inequality() {
#[test]
fn detect_size_change() {
let list = HardlinkList::<Bytes>::new();
list.add(123.into(), 100.into(), 1, "a".as_ref())
list.add(123.into(), 0, 100.into(), 1, "a".as_ref())
.expect("add the first path");
let actual = list
.add(123.into(), 110.into(), 1, "b".as_ref())
.add(123.into(), 0, 110.into(), 1, "b".as_ref())
.expect_err("add the second path");
let expected = AddError::SizeConflict(SizeConflictError {
ino: 123.into(),
Expand All @@ -135,10 +136,10 @@ fn detect_size_change() {
#[test]
fn detect_number_of_links_change() {
let list = HardlinkList::<Bytes>::new();
list.add(123.into(), 100.into(), 1, "a".as_ref())
list.add(123.into(), 0, 100.into(), 1, "a".as_ref())
.expect("add the first path");
let actual = list
.add(123.into(), 100.into(), 2, "b".as_ref())
.add(123.into(), 0, 100.into(), 2, "b".as_ref())
.expect_err("add the second path");
let expected = AddError::NumberOfLinksConflict(NumberOfLinksConflictError {
ino: 123.into(),
Expand All @@ -147,3 +148,38 @@ fn detect_number_of_links_change() {
});
assert_eq!(actual, expected);
}

/// Files on different devices may share the same inode number, but they are
/// unrelated — hardlinks cannot span filesystem boundaries. Verify that two
/// files with the same inode number but different device numbers produce
/// separate entries in the list (i.e. the device number is actually used in
/// the deduplication key).
#[test]
fn same_ino_on_different_devices_are_treated_separately() {
let list = HardlinkList::<Bytes>::new();

// dev=1, ino=100 — first filesystem
list.add(100.into(), 1, 50.into(), 2, "dev1/file_a".as_ref())
.expect("add dev1/file_a");
list.add(100.into(), 1, 50.into(), 2, "dev1/file_b".as_ref())
.expect("add dev1/file_b (same dev+ino → same inode group)");

// dev=2, ino=100 — second filesystem, coincidentally same inode number
list.add(100.into(), 2, 80.into(), 2, "dev2/file_c".as_ref())
.expect("add dev2/file_c (different dev → separate inode group)");
list.add(100.into(), 2, 80.into(), 2, "dev2/file_d".as_ref())
.expect("add dev2/file_d (same dev+ino → same inode group as file_c)");

// Each device should produce its own entry, so the list should have 2 entries.
assert_eq!(list.len(), 2, "expected one entry per (dev, ino) pair");

let reflection = list.into_reflection();
// Both entries expose ino=100 in the reflection (device is not part of the
// public JSON format), so there are still 2 entries in the vector.
assert_eq!(reflection.len(), 2);

// Paths are grouped per (dev, ino): each group has exactly 2 paths.
for entry in reflection.iter() {
assert_eq!(entry.paths.len(), 2);
}
}
Loading