diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index faeeb0ee2b..d6f27f3cac 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -112,6 +112,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/memxattr", "//pkg/sync", "//pkg/syserr", "//pkg/unet", diff --git a/pkg/sentry/fsimpl/gofer/dentry_impl.go b/pkg/sentry/fsimpl/gofer/dentry_impl.go index 2fe86e3380..1d4177c40e 100644 --- a/pkg/sentry/fsimpl/gofer/dentry_impl.go +++ b/pkg/sentry/fsimpl/gofer/dentry_impl.go @@ -44,16 +44,16 @@ import ( // analysis to proceed as usual and avoids heap allocations. // // Also note that the default case in these type switch statements panics. We -// do not do panic(fmt.Sprintf("... %T", d.impl)) because somehow it adds a lot +// do not do panic(fmt.Sprintf("... %T", d.inode.impl)) because somehow it adds a lot // of overhead to the type switch. So instead we panic with a constant string. -// Precondition: d.handleMu must be locked. +// Precondition: d.inode.handleMu must be locked. func (d *dentry) isReadHandleOk() bool { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.readFDLisa.Ok() case *directfsDentry: - return d.readFD.RacyLoad() >= 0 + return d.inode.readFD.RacyLoad() >= 0 case nil: // synthetic dentry return false default: @@ -61,13 +61,13 @@ func (d *dentry) isReadHandleOk() bool { } } -// Precondition: d.handleMu must be locked. +// Precondition: d.inode.handleMu must be locked. func (d *dentry) isWriteHandleOk() bool { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.writeFDLisa.Ok() case *directfsDentry: - return d.writeFD.RacyLoad() >= 0 + return d.inode.writeFD.RacyLoad() >= 0 case nil: // synthetic dentry return false default: @@ -75,16 +75,16 @@ func (d *dentry) isWriteHandleOk() bool { } } -// Precondition: d.handleMu must be locked. 
+// Precondition: d.inode.handleMu must be locked. func (d *dentry) readHandle() handle { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return handle{ fdLisa: dt.readFDLisa, - fd: d.readFD.RacyLoad(), + fd: d.inode.readFD.RacyLoad(), } case *directfsDentry: - return handle{fd: d.readFD.RacyLoad()} + return handle{fd: d.inode.readFD.RacyLoad()} case nil: // synthetic dentry return noHandle default: @@ -92,16 +92,16 @@ func (d *dentry) readHandle() handle { } } -// Precondition: d.handleMu must be locked. +// Precondition: d.inode.handleMu must be locked. func (d *dentry) writeHandle() handle { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return handle{ fdLisa: dt.writeFDLisa, - fd: d.writeFD.RacyLoad(), + fd: d.inode.writeFD.RacyLoad(), } case *directfsDentry: - return handle{fd: d.writeFD.RacyLoad()} + return handle{fd: d.inode.writeFD.RacyLoad()} case nil: // synthetic dentry return noHandle default: @@ -127,7 +127,7 @@ func (d *dentry) openHandle(ctx context.Context, read, write, trunc bool) (handl if trunc { flags |= unix.O_TRUNC } - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.openHandle(ctx, flags) case *directfsDentry: @@ -138,10 +138,10 @@ func (d *dentry) openHandle(ctx context.Context, read, write, trunc bool) (handl } // Preconditions: -// - d.handleMu must be locked. +// - d.inode.handleMu must be locked. // - !d.isSynthetic(). func (d *dentry) updateHandles(ctx context.Context, h handle, readable, writable bool) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: dt.updateHandles(ctx, h, readable, writable) case *directfsDentry: @@ -152,21 +152,21 @@ func (d *dentry) updateHandles(ctx context.Context, h handle, readable, writable } // Preconditions: -// - d.handleMu must be locked. +// - d.inode.handleMu must be locked. // - !d.isSynthetic(). 
func (d *dentry) closeHostFDs() { - // We can use RacyLoad() because d.handleMu is locked. - if d.readFD.RacyLoad() >= 0 { - _ = unix.Close(int(d.readFD.RacyLoad())) + // We can use RacyLoad() because d.inode.handleMu is locked. + if d.inode.readFD.RacyLoad() >= 0 { + _ = unix.Close(int(d.inode.readFD.RacyLoad())) } - if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() { - _ = unix.Close(int(d.writeFD.RacyLoad())) + if d.inode.writeFD.RacyLoad() >= 0 && d.inode.readFD.RacyLoad() != d.inode.writeFD.RacyLoad() { + _ = unix.Close(int(d.inode.writeFD.RacyLoad())) } - d.readFD = atomicbitops.FromInt32(-1) - d.writeFD = atomicbitops.FromInt32(-1) - d.mmapFD = atomicbitops.FromInt32(-1) + d.inode.readFD = atomicbitops.FromInt32(-1) + d.inode.writeFD = atomicbitops.FromInt32(-1) + d.inode.mmapFD = atomicbitops.FromInt32(-1) - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *directfsDentry: if dt.controlFD >= 0 { _ = unix.Close(dt.controlFD) @@ -181,14 +181,14 @@ func (d *dentry) closeHostFDs() { // // Preconditions: // - !d.isSynthetic(). -// - d.metadataMu is locked. +// - d.inode.metadataMu is locked. // -// +checklocks:d.metadataMu +// +checklocks:d.inode.metadataMu func (d *dentry) updateMetadataLocked(ctx context.Context, h handle) error { // Need checklocksforce below because checklocks has no way of knowing that - // d.impl.(*dentryImpl).dentry == d. It can't know that the right metadataMu + // d.inode.impl.(*dentryImpl).dentry == d. It can't know that the right metadataMu // is already locked. - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.updateMetadataLocked(ctx, h) // +checklocksforce: acquired by precondition. case *directfsDentry: @@ -202,7 +202,7 @@ func (d *dentry) updateMetadataLocked(ctx context.Context, h handle) error { // - !d.isSynthetic(). // - fs.renameMu is locked. 
func (d *dentry) prepareSetStat(ctx context.Context, stat *linux.Statx) error { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: // Nothing to be done. return nil @@ -215,7 +215,7 @@ func (d *dentry) prepareSetStat(ctx context.Context, stat *linux.Statx) error { // Precondition: fs.renameMu is locked if d is a socket. func (d *dentry) chmod(ctx context.Context, mode uint16) error { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return chmod(ctx, dt.controlFD, mode) case *directfsDentry: @@ -227,10 +227,10 @@ func (d *dentry) chmod(ctx context.Context, mode uint16) error { // Preconditions: // - !d.isSynthetic(). -// - d.handleMu is locked. +// - d.inode.handleMu is locked. // - fs.renameMu is locked. func (d *dentry) setStatLocked(ctx context.Context, stat *linux.Statx) (uint32, error, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.controlFD.SetStat(ctx, stat) case *directfsDentry: @@ -241,9 +241,9 @@ func (d *dentry) setStatLocked(ctx context.Context, stat *linux.Statx) (uint32, } } -// Precondition: d.handleMu must be locked. +// Precondition: d.inode.handleMu must be locked. 
func (d *dentry) destroyImpl(ctx context.Context) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: dt.destroy(ctx) case *directfsDentry: @@ -258,7 +258,7 @@ func (d *dentry) destroyImpl(ctx context.Context) { // // +checklocksread:d.opMu func (d *dentry) getRemoteChild(ctx context.Context, name string) (*dentry, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.getRemoteChild(ctx, name) case *directfsDentry: @@ -278,13 +278,13 @@ func (d *dentry) getRemoteChild(ctx context.Context, name string) (*dentry, erro // // +checklocksread:d.opMu func (d *dentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp resolvingPath, ds **[]*dentry) (*dentry, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.getRemoteChildAndWalkPathLocked(ctx, rp, ds) case *directfsDentry: // We need to check for races because opMu is read locked which allows // concurrent walks to occur. - return d.fs.getRemoteChildLocked(ctx, d, rp.Component(), true /* checkForRace */, ds) + return d.inode.fs.getRemoteChildLocked(ctx, d, rp.Component(), true /* checkForRace */, ds) default: panic("unknown dentry implementation") } @@ -292,7 +292,7 @@ func (d *dentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp resolvi // Precondition: !d.isSynthetic(). func (d *dentry) listXattrImpl(ctx context.Context, size uint64) ([]string, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.controlFD.ListXattr(ctx, size) case *directfsDentry: @@ -305,7 +305,7 @@ func (d *dentry) listXattrImpl(ctx context.Context, size uint64) ([]string, erro // Precondition: !d.isSynthetic(). 
func (d *dentry) getXattrImpl(ctx context.Context, opts *vfs.GetXattrOptions) (string, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.controlFD.GetXattr(ctx, opts.Name, opts.Size) case *directfsDentry: @@ -317,7 +317,7 @@ func (d *dentry) getXattrImpl(ctx context.Context, opts *vfs.GetXattrOptions) (s // Precondition: !d.isSynthetic(). func (d *dentry) setXattrImpl(ctx context.Context, opts *vfs.SetXattrOptions) error { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.controlFD.SetXattr(ctx, opts.Name, opts.Value, opts.Flags) case *directfsDentry: @@ -330,7 +330,7 @@ func (d *dentry) setXattrImpl(ctx context.Context, opts *vfs.SetXattrOptions) er // Precondition: !d.isSynthetic(). func (d *dentry) removeXattrImpl(ctx context.Context, name string) error { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.controlFD.RemoveXattr(ctx, name) case *directfsDentry: @@ -343,7 +343,7 @@ func (d *dentry) removeXattrImpl(ctx context.Context, name string) error { // Precondition: !d.isSynthetic(). func (d *dentry) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.mknod(ctx, name, creds, opts) case *directfsDentry: @@ -356,13 +356,13 @@ func (d *dentry) mknod(ctx context.Context, name string, creds *auth.Credentials // Preconditions: // - !d.isSynthetic(). // - !target.isSynthetic(). -// - d.fs.renameMu must be locked. +// - d.inode.fs.renameMu must be locked. 
func (d *dentry) link(ctx context.Context, target *dentry, name string) (*dentry, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: - return dt.link(ctx, target.impl.(*lisafsDentry), name) + return dt.link(ctx, target.inode.impl.(*lisafsDentry), name) case *directfsDentry: - return dt.link(target.impl.(*directfsDentry), name) + return dt.link(target.inode.impl.(*directfsDentry), name) default: panic("unknown dentry implementation") } @@ -370,7 +370,7 @@ func (d *dentry) link(ctx context.Context, target *dentry, name string) (*dentry // Precondition: !d.isSynthetic(). func (d *dentry) mkdir(ctx context.Context, name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool) (*dentry, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.mkdir(ctx, name, mode, uid, gid, createDentry) case *directfsDentry: @@ -382,7 +382,7 @@ func (d *dentry) mkdir(ctx context.Context, name string, mode linux.FileMode, ui // Precondition: !d.isSynthetic(). func (d *dentry) symlink(ctx context.Context, name, target string, creds *auth.Credentials) (*dentry, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.symlink(ctx, name, target, creds) case *directfsDentry: @@ -394,7 +394,7 @@ func (d *dentry) symlink(ctx context.Context, name, target string, creds *auth.C // Precondition: !d.isSynthetic(). func (d *dentry) openCreate(ctx context.Context, name string, accessFlags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID, createDentry bool) (*dentry, handle, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.openCreate(ctx, name, accessFlags, mode, uid, gid, createDentry) case *directfsDentry: @@ -406,10 +406,10 @@ func (d *dentry) openCreate(ctx context.Context, name string, accessFlags uint32 // Preconditions: // - d.isDir(). 
-// - d.handleMu must be locked. +// - d.inode.handleMu must be locked. // - !d.isSynthetic(). func (d *dentry) getDirentsLocked(ctx context.Context, recordDirent func(name string, key inoKey, dType uint8)) error { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.getDirentsLocked(ctx, recordDirent) case *directfsDentry: @@ -421,9 +421,9 @@ func (d *dentry) getDirentsLocked(ctx context.Context, recordDirent func(name st // Precondition: !d.isSynthetic(). func (d *dentry) flush(ctx context.Context) error { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - switch dt := d.impl.(type) { + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() + switch dt := d.inode.impl.(type) { case *lisafsDentry: return flush(ctx, dt.writeFDLisa) case *directfsDentry: @@ -436,13 +436,13 @@ func (d *dentry) flush(ctx context.Context) error { // Precondition: !d.isSynthetic(). func (d *dentry) allocate(ctx context.Context, mode, offset, length uint64) error { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - switch dt := d.impl.(type) { + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.writeFDLisa.Allocate(ctx, mode, offset, length) case *directfsDentry: - return unix.Fallocate(int(d.writeFD.RacyLoad()), uint32(mode), int64(offset), int64(length)) + return unix.Fallocate(int(d.inode.writeFD.RacyLoad()), uint32(mode), int64(offset), int64(length)) default: panic("unknown dentry implementation") } @@ -459,7 +459,7 @@ func (d *dentry) connect(ctx context.Context, sockType linux.SockType) (int, err euid = lisafs.UID(creds.EffectiveKUID) egid = lisafs.GID(creds.EffectiveKGID) } - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.controlFD.Connect(ctx, sockType, euid, egid) case *directfsDentry: @@ -471,7 +471,7 @@ func (d *dentry) connect(ctx context.Context, sockType linux.SockType) (int, err // Precondition: 
!d.isSynthetic(). func (d *dentry) readlinkImpl(ctx context.Context) (string, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.controlFD.ReadLinkAt(ctx) case *directfsDentry: @@ -483,7 +483,7 @@ func (d *dentry) readlinkImpl(ctx context.Context) (string, error) { // Precondition: !d.isSynthetic(). func (d *dentry) unlink(ctx context.Context, name string, flags uint32) error { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.controlFD.UnlinkAt(ctx, name, flags) case *directfsDentry: @@ -495,11 +495,11 @@ func (d *dentry) unlink(ctx context.Context, name string, flags uint32) error { // Precondition: !d.isSynthetic(). func (d *dentry) rename(ctx context.Context, oldName string, newParent *dentry, newName string) error { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: - return dt.controlFD.RenameAt(ctx, oldName, newParent.impl.(*lisafsDentry).controlFD.ID(), newName) + return dt.controlFD.RenameAt(ctx, oldName, newParent.inode.impl.(*lisafsDentry).controlFD.ID(), newName) case *directfsDentry: - return fsutil.RenameAt(dt.controlFD, oldName, newParent.impl.(*directfsDentry).controlFD, newName) + return fsutil.RenameAt(dt.controlFD, oldName, newParent.inode.impl.(*directfsDentry).controlFD, newName) default: panic("unknown dentry implementation") } @@ -507,7 +507,7 @@ func (d *dentry) rename(ctx context.Context, oldName string, newParent *dentry, // Precondition: !d.isSynthetic(). func (d *dentry) statfs(ctx context.Context) (linux.Statfs, error) { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: return dt.statfs(ctx) case *directfsDentry: @@ -524,7 +524,7 @@ func (fs *filesystem) restoreRoot(ctx context.Context, opts *vfs.CompleteRestore } // The root is always non-synthetic. 
- switch dt := fs.root.impl.(type) { + switch dt := fs.root.inode.impl.(type) { case *lisafsDentry: return dt.restoreFile(ctx, &rootInode, opts) case *directfsDentry: @@ -539,39 +539,39 @@ func (fs *filesystem) restoreRoot(ctx context.Context, opts *vfs.CompleteRestore // - !d.isSynthetic(). // - d.parent != nil and has been restored. func (d *dentry) restoreFile(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { - switch dt := d.impl.(type) { + switch dt := d.inode.impl.(type) { case *lisafsDentry: - controlFD := d.parent.Load().impl.(*lisafsDentry).controlFD + controlFD := d.parent.Load().inode.impl.(*lisafsDentry).controlFD inode, err := controlFD.Walk(ctx, d.name) if err != nil { if !dt.isDir() || !dt.forMountpoint { - return fmt.Errorf("failed to walk %q of type %x: %w", genericDebugPathname(d.fs, d), dt.fileType(), err) + return fmt.Errorf("failed to walk %q of type %x: %w", genericDebugPathname(d.inode.fs, d), dt.fileType(), err) } // Recreate directories that were created during volume mounting, since // during restore we don't attempt to remount them. 
- inode, err = controlFD.MkdirAt(ctx, d.name, linux.FileMode(d.mode.Load()), lisafs.UID(d.uid.Load()), lisafs.GID(d.gid.Load())) + inode, err = controlFD.MkdirAt(ctx, d.name, linux.FileMode(d.inode.mode.Load()), lisafs.UID(d.inode.uid.Load()), lisafs.GID(d.inode.gid.Load())) if err != nil { - return fmt.Errorf("failed to create mountpoint directory at %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to create mountpoint directory at %q: %w", genericDebugPathname(d.inode.fs, d), err) } } return dt.restoreFile(ctx, &inode, opts) case *directfsDentry: - controlFD := d.parent.Load().impl.(*directfsDentry).controlFD + controlFD := d.parent.Load().inode.impl.(*directfsDentry).controlFD childFD, err := tryOpen(func(flags int) (int, error) { n, err := unix.Openat(controlFD, d.name, flags, 0) return n, err }) if err != nil { if !dt.isDir() || !dt.forMountpoint { - return fmt.Errorf("failed to walk %q of type %x: %w", genericDebugPathname(d.fs, d), dt.fileType(), err) + return fmt.Errorf("failed to walk %q of type %x: %w", genericDebugPathname(d.inode.fs, d), dt.fileType(), err) } // Recreate directories that were created during volume mounting, since // during restore we don't attempt to remount them. - if err := unix.Mkdirat(controlFD, d.name, d.mode.Load()); err != nil { - return fmt.Errorf("failed to create mountpoint directory at %q: %w", genericDebugPathname(d.fs, d), err) + if err := unix.Mkdirat(controlFD, d.name, d.inode.mode.Load()); err != nil { + return fmt.Errorf("failed to create mountpoint directory at %q: %w", genericDebugPathname(d.inode.fs, d), err) } // Try again... 
@@ -579,7 +579,7 @@ func (d *dentry) restoreFile(ctx context.Context, opts *vfs.CompleteRestoreOptio return unix.Openat(controlFD, d.name, flags, 0) }) if err != nil { - return fmt.Errorf("failed to open %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to open %q: %w", genericDebugPathname(d.inode.fs, d), err) } } return dt.restoreFile(ctx, childFD, opts) @@ -601,7 +601,7 @@ func (r *revalidateState) doRevalidation(ctx context.Context, vfsObj *vfs.Virtua if r.start.isSynthetic() { return nil } - switch r.start.impl.(type) { + switch r.start.inode.impl.(type) { case *lisafsDentry: return doRevalidationLisafs(ctx, vfsObj, r, ds) case *directfsDentry: diff --git a/pkg/sentry/fsimpl/gofer/directfs_dentry.go b/pkg/sentry/fsimpl/gofer/directfs_dentry.go index bdec2f196f..6a2252124c 100644 --- a/pkg/sentry/fsimpl/gofer/directfs_dentry.go +++ b/pkg/sentry/fsimpl/gofer/directfs_dentry.go @@ -22,7 +22,6 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fsutil" "gvisor.dev/gvisor/pkg/lisafs" @@ -77,7 +76,7 @@ func (fs *filesystem) getDirectfsRootDentry(ctx context.Context, rootHostFD int, rootControlFD.Close(ctx, false /* flush */) return nil, err } - d.impl.(*directfsDentry).controlFDLisa = rootControlFD + d.inode.impl.(*directfsDentry).controlFDLisa = rootControlFD return d, nil } @@ -120,23 +119,14 @@ func (fs *filesystem) newDirectfsDentry(controlFD int) (*dentry, error) { return nil, err } inoKey := inoKeyFromStat(&stat) + inode, err := fs.createOrFindInodeStat(inoKey, &stat) + if inode == nil { + log.Warningf("could not create or find inode") + return nil, err + } d := &directfsDentry{ dentry: dentry{ - fs: fs, - inoKey: inoKey, - ino: fs.inoFromKey(inoKey), - mode: atomicbitops.FromUint32(stat.Mode), - uid: atomicbitops.FromUint32(stat.Uid), - gid: atomicbitops.FromUint32(stat.Gid), - blockSize: 
atomicbitops.FromUint32(uint32(stat.Blksize)), - readFD: atomicbitops.FromInt32(-1), - writeFD: atomicbitops.FromInt32(-1), - mmapFD: atomicbitops.FromInt32(-1), - size: atomicbitops.FromUint64(uint64(stat.Size)), - atime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Atim)), - mtime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Mtim)), - ctime: atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Ctim)), - nlink: atomicbitops.FromUint32(uint32(stat.Nlink)), + inode: inode, }, controlFD: controlFD, } @@ -159,7 +149,7 @@ func (d *directfsDentry) openHandle(ctx context.Context, flags uint32) (handle, if err != nil { return noHandle, err } - d.fs.client.CloseFD(ctx, openFD, true /* flush */) + d.inode.fs.client.CloseFD(ctx, openFD, true /* flush */) if hostFD < 0 { log.Warningf("gofer did not donate an FD for mount point") return noHandle, unix.EIO @@ -170,7 +160,7 @@ func (d *directfsDentry) openHandle(ctx context.Context, flags uint32) (handle, // The only way to re-open an FD with different flags is via procfs or // openat(2) from the parent. Procfs does not exist here. So use parent. flags |= hostOpenFlags - openFD, err := unix.Openat(parent.impl.(*directfsDentry).controlFD, d.name, int(flags), 0) + openFD, err := unix.Openat(parent.inode.impl.(*directfsDentry).controlFD, d.name, int(flags), 0) if err != nil { return noHandle, err } @@ -179,8 +169,8 @@ func (d *directfsDentry) openHandle(ctx context.Context, flags uint32) (handle, // Precondition: fs.renameMu is locked. 
func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error { - d.handleMu.Lock() - defer d.handleMu.Unlock() + d.inode.handleMu.Lock() + defer d.inode.handleMu.Unlock() if d.controlFDLisa.Ok() { return nil } @@ -189,7 +179,7 @@ func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error { root := d for root.parent.Load() != nil { names = append(names, root.name) - root = root.parent.Load().impl.(*directfsDentry) + root = root.parent.Load().inode.impl.(*directfsDentry) } if !root.controlFDLisa.Ok() { panic("controlFDLisa is not set for mount point dentry") @@ -210,7 +200,7 @@ func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error { // Close everything except for inodes[last] if it exists. for i := 0; i < len(inodes) && i < last; i++ { flush := i == last-1 || i == len(inodes)-1 - d.fs.client.CloseFD(ctx, inodes[i].ControlFD, flush) + d.inode.fs.client.CloseFD(ctx, inodes[i].ControlFD, flush) } }() switch status { @@ -220,15 +210,15 @@ func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error { log.Warningf("intermediate path component was a symlink? names = %v, inodes = %+v", names, inodes) return unix.ELOOP case lisafs.WalkSuccess: - d.controlFDLisa = d.fs.client.NewFD(inodes[last].ControlFD) + d.controlFDLisa = d.inode.fs.client.NewFD(inodes[last].ControlFD) return nil } panic("unreachable") } -// Precondition: d.metadataMu must be locked. +// Precondition: d.inode.metadataMu must be locked. // -// +checklocks:d.metadataMu +// +checklocks:d.inode.metadataMu func (d *directfsDentry) updateMetadataLocked(h handle) error { handleMuRLocked := false if h.fd < 0 { @@ -238,17 +228,17 @@ func (d *directfsDentry) updateMetadataLocked(h handle) error { // filesystem implementations may update a writable FD's metadata after // writes, without making metadata updates immediately visible to read-only // FDs representing the same file. 
- d.handleMu.RLock() + d.inode.handleMu.RLock() switch { - case d.writeFD.RacyLoad() >= 0: - h.fd = d.writeFD.RacyLoad() + case d.inode.writeFD.RacyLoad() >= 0: + h.fd = d.inode.writeFD.RacyLoad() handleMuRLocked = true - case d.readFD.RacyLoad() >= 0: - h.fd = d.readFD.RacyLoad() + case d.inode.readFD.RacyLoad() >= 0: + h.fd = d.inode.readFD.RacyLoad() handleMuRLocked = true default: h.fd = int32(d.controlFD) - d.handleMu.RUnlock() + d.inode.handleMu.RUnlock() } } @@ -256,7 +246,7 @@ func (d *directfsDentry) updateMetadataLocked(h handle) error { err := unix.Fstat(int(h.fd), &stat) if handleMuRLocked { // handleMu must be released before updateMetadataFromStatLocked(). - d.handleMu.RUnlock() // +checklocksforce: complex case. + d.inode.handleMu.RUnlock() // +checklocksforce: complex case. } if err != nil { return err @@ -278,7 +268,7 @@ func (d *directfsDentry) chmod(ctx context.Context, mode uint16) error { // Sockets use O_PATH control FDs. However, fchmod(2) fails with EBADF for // O_PATH FDs. Try to fchmodat(2) it from its parent. if parent := d.parent.Load(); parent != nil { - return unix.Fchmodat(parent.impl.(*directfsDentry).controlFD, d.name, uint32(mode), 0 /* flags */) + return unix.Fchmodat(parent.inode.impl.(*directfsDentry).controlFD, d.name, uint32(mode), 0 /* flags */) } // This is a mount point socket (no parent). Fallback to using lisafs. @@ -289,7 +279,7 @@ func (d *directfsDentry) chmod(ctx context.Context, mode uint16) error { } // Preconditions: -// - d.handleMu is locked if d is a regular file. +// - d.inode.handleMu is locked if d is a regular file. // - fs.renameMu is locked if d is a symlink. func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error { if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME) == 0 { @@ -314,7 +304,7 @@ func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error if d.isRegularFile() { // utimensat(2) requires a writable FD for regular files. See BUGS // section. 
dentry.prepareSetStat() should have acquired a writable FD. - hostFD = int(d.writeFD.RacyLoad()) + hostFD = int(d.inode.writeFD.RacyLoad()) } // Non-symlinks can operate directly on the fd using an empty name. return fsutil.Utimensat(hostFD, "", utimes, 0) @@ -324,7 +314,7 @@ func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error // symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty // name. if parent := d.parent.Load(); parent != nil { - return fsutil.Utimensat(parent.impl.(*directfsDentry).controlFD, d.name, utimes, unix.AT_SYMLINK_NOFOLLOW) + return fsutil.Utimensat(parent.inode.impl.(*directfsDentry).controlFD, d.name, utimes, unix.AT_SYMLINK_NOFOLLOW) } // This is a mount point symlink. We don't have a parent FD. Fallback to @@ -357,7 +347,7 @@ func (d *directfsDentry) prepareSetStat(ctx context.Context, stat *linux.Statx) } // Preconditions: -// - d.handleMu is locked. +// - d.inode.handleMu is locked. // - fs.renameMu is locked. func (d *directfsDentry) setStatLocked(ctx context.Context, stat *linux.Statx) (failureMask uint32, failureErr error) { if stat.Mask&unix.STATX_MODE != 0 { @@ -369,7 +359,7 @@ func (d *directfsDentry) setStatLocked(ctx context.Context, stat *linux.Statx) ( if stat.Mask&unix.STATX_SIZE != 0 { // ftruncate(2) requires a writable FD. - if err := unix.Ftruncate(int(d.writeFD.RacyLoad()), int64(stat.Size)); err != nil { + if err := unix.Ftruncate(int(d.inode.writeFD.RacyLoad()), int64(stat.Size)); err != nil { failureMask |= unix.STATX_SIZE failureErr = err } @@ -414,7 +404,7 @@ func fchown(fd int, uid auth.KUID, gid auth.KGID) error { return unix.Fchownat(fd, "", u, g, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } -// Precondition: d.handleMu must be locked. +// Precondition: d.inode.handleMu must be locked. 
func (d *directfsDentry) destroy(ctx context.Context) { if d.controlFD >= 0 { _ = unix.Close(d.controlFD) @@ -432,7 +422,7 @@ func (d *directfsDentry) getHostChild(name string) (*dentry, error) { if err != nil { return nil, err } - return d.fs.newDirectfsDentry(childFD) + return d.inode.fs.newDirectfsDentry(childFD) } func (d *directfsDentry) getXattr(ctx context.Context, name string, size uint64) (string, error) { @@ -464,7 +454,7 @@ func (d *directfsDentry) getCreatedChild(name string, uid auth.KUID, gid auth.KG deleteChild := func() { // Best effort attempt to remove the newly created child on failure. if err := unix.Unlinkat(d.controlFD, name, unlinkFlags); err != nil { - log.Warningf("error unlinking newly created child %q after failure: %v", filepath.Join(genericDebugPathname(d.fs, &d.dentry), name), err) + log.Warningf("error unlinking newly created child %q after failure: %v", filepath.Join(genericDebugPathname(d.inode.fs, &d.dentry), name), err) } } @@ -484,7 +474,7 @@ func (d *directfsDentry) getCreatedChild(name string, uid auth.KUID, gid auth.KG var child *dentry if createDentry { - child, err = d.fs.newDirectfsDentry(childFD) + child, err = d.inode.fs.newDirectfsDentry(childFD) if err != nil { // Ownership of childFD was passed to newDirectDentry(), so no need to // clean that up. @@ -525,12 +515,12 @@ func (d *directfsDentry) bindAt(ctx context.Context, name string, creds *auth.Cr if err != nil { return nil, err } - d.fs.client.CloseFD(ctx, childInode.ControlFD, true /* flush */) + d.inode.fs.client.CloseFD(ctx, childInode.ControlFD, true /* flush */) // Update opts.Endpoint that it is bound. 
hbep := opts.Endpoint.(transport.HostBoundEndpoint) if err := hbep.SetBoundSocketFD(ctx, boundSocketFD); err != nil { if err := unix.Unlinkat(d.controlFD, name, 0); err != nil { - log.Warningf("error unlinking newly created socket %q after failure: %v", filepath.Join(genericDebugPathname(d.fs, &d.dentry), name), err) + log.Warningf("error unlinking newly created socket %q after failure: %v", filepath.Join(genericDebugPathname(d.inode.fs, &d.dentry), name), err) } return nil, err } @@ -542,12 +532,12 @@ func (d *directfsDentry) bindAt(ctx context.Context, name string, creds *auth.Cr } // Set the endpoint on the newly created child dentry, and take the // corresponding extra dentry reference. - child.endpoint = opts.Endpoint + child.inode.endpoint = opts.Endpoint child.IncRef() return child, nil } -// Precondition: d.fs.renameMu must be locked. +// Precondition: d.inode.fs.renameMu must be locked. func (d *directfsDentry) link(target *directfsDentry, name string) (*dentry, error) { // Using linkat(targetFD, "", newdirfd, name, AT_EMPTY_PATH) requires // CAP_DAC_READ_SEARCH in the *root* userns. With directfs, the sandbox @@ -556,7 +546,7 @@ func (d *directfsDentry) link(target *directfsDentry, name string) (*dentry, err // using olddirfd to call linkat(2). // Also note that d and target are from the same mount. Given target is a // non-directory and d is a directory, target.parent must exist. - if err := unix.Linkat(target.parent.Load().impl.(*directfsDentry).controlFD, target.name, d.controlFD, name, 0); err != nil { + if err := unix.Linkat(target.parent.Load().inode.impl.(*directfsDentry).controlFD, target.name, d.controlFD, name, 0); err != nil { return nil, err } // Note that we don't need to set uid/gid for the new child. This is a hard @@ -596,7 +586,7 @@ func (d *directfsDentry) openCreate(name string, accessFlags uint32, mode linux. 
} func (d *directfsDentry) getDirentsLocked(recordDirent func(name string, key inoKey, dType uint8)) error { - readFD := int(d.readFD.RacyLoad()) + readFD := int(d.inode.readFD.RacyLoad()) if _, err := unix.Seek(readFD, 0, 0); err != nil { return err } @@ -607,7 +597,7 @@ func (d *directfsDentry) getDirentsLocked(recordDirent func(name string, key ino // TODO(gvisor.dev/issue/6665): Get rid of per-dirent stat. stat, err := fsutil.StatAt(d.controlFD, name) if err != nil { - log.Warningf("Getdent64: skipping file %q with failed stat, err: %v", path.Join(genericDebugPathname(d.fs, &d.dentry), name), err) + log.Warningf("Getdent64: skipping file %q with failed stat, err: %v", path.Join(genericDebugPathname(d.inode.fs, &d.dentry), name), err) return } recordDirent(name, inoKeyFromStat(&stat), ftype) @@ -664,7 +654,7 @@ func (d *directfsDentry) restoreFile(ctx context.Context, controlFD int, opts *v var stat unix.Stat_t if err := unix.Fstat(controlFD, &stat); err != nil { _ = unix.Close(controlFD) - return fmt.Errorf("failed to stat %q: %w", genericDebugPathname(d.fs, &d.dentry), err) + return fmt.Errorf("failed to stat %q: %w", genericDebugPathname(d.inode.fs, &d.dentry), err) } d.controlFD = controlFD @@ -675,23 +665,23 @@ func (d *directfsDentry) restoreFile(ctx context.Context, controlFD int, opts *v // checking inoKey. // // - We need to associate the new inoKey with the existing d.ino. - d.inoKey = inoKeyFromStat(&stat) - d.fs.inoMu.Lock() - d.fs.inoByKey[d.inoKey] = d.ino - d.fs.inoMu.Unlock() + d.inode.inoKey = inoKeyFromStat(&stat) + d.inode.fs.inoMu.Lock() + d.inode.fs.inoByKey[d.inode.inoKey] = d.inode.ino + d.inode.fs.inoMu.Unlock() // Check metadata stability before updating metadata. 
- d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() if d.isRegularFile() { if opts.ValidateFileSizes { - if d.size.RacyLoad() != uint64(stat.Size) { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d.fs, &d.dentry), d.size.Load(), stat.Size)} + if d.inode.size.RacyLoad() != uint64(stat.Size) { + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d.inode.fs, &d.dentry), d.inode.size.Load(), stat.Size)} } } if opts.ValidateFileModificationTimestamps { - if want := dentryTimestampFromUnix(stat.Mtim); d.mtime.RacyLoad() != want { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d.fs, &d.dentry), linux.NsecToStatxTimestamp(d.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))} + if want := dentryTimestampFromUnix(stat.Mtim); d.inode.mtime.RacyLoad() != want { + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d.inode.fs, &d.dentry), linux.NsecToStatxTimestamp(d.inode.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))} } } } @@ -699,9 +689,9 @@ func (d *directfsDentry) restoreFile(ctx context.Context, controlFD int, opts *v d.updateMetadataFromStatLocked(&stat) } - if rw, ok := d.fs.savedDentryRW[&d.dentry]; ok { + if rw, ok := d.inode.fs.savedDentryRW[&d.dentry]; ok { if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil { - return fmt.Errorf("failed to restore file handles (read=%t, write=%t) for %q: %w", rw.read, rw.write, genericDebugPathname(d.fs, &d.dentry), err) + return fmt.Errorf("failed to restore file handles (read=%t, write=%t) for %q: %w", rw.read, rw.write, 
genericDebugPathname(d.inode.fs, &d.dentry), err) } } @@ -719,7 +709,7 @@ func doRevalidationDirectfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, // The function receiver has to be named `d` (to be consistent with other // receivers). But `d` variable is also used below in various places. This // helps with readability and makes code less error prone. - start := state.start.impl.(*directfsDentry) + start := state.start.inode.impl.(*directfsDentry) if state.refreshStart { start.updateMetadata(ctx) } @@ -733,20 +723,20 @@ func doRevalidationDirectfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, var stat unix.Stat_t // Lock metadata *before* getting attributes for d. - d.metadataMu.Lock() + d.inode.metadataMu.Lock() found := err == nil if found { err = unix.Fstat(childFD, &stat) _ = unix.Close(childFD) if err != nil { - d.metadataMu.Unlock() + d.inode.metadataMu.Unlock() return err } } // Note that synthetic dentries will always fail this comparison check. - if !found || d.inoKey != inoKeyFromStat(&stat) { - d.metadataMu.Unlock() + if !found || d.inode.inoKey != inoKeyFromStat(&stat) { + d.inode.metadataMu.Unlock() if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace // it. @@ -759,11 +749,11 @@ func doRevalidationDirectfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, } // The file at this path hasn't changed. Just update cached metadata. - d.impl.(*directfsDentry).updateMetadataFromStatLocked(&stat) // +checklocksforce: d.metadataMu is locked above. - d.metadataMu.Unlock() + d.inode.impl.(*directfsDentry).updateMetadataFromStatLocked(&stat) // +checklocksforce: d.inode.metadataMu is locked above. + d.inode.metadataMu.Unlock() // Advance parent. 
- parent = d.impl.(*directfsDentry) + parent = d.inode.impl.(*directfsDentry) } return nil } diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 31c7d9c62a..1dd65a0e8d 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -51,7 +51,7 @@ func (d *dentry) isDir() bool { // +checklocks:d.childrenMu func (d *dentry) cacheNewChildLocked(child *dentry, name string) { d.IncRef() // reference held by child on its parent - genericSetParentAndName(d.fs, child, d, name) + genericSetParentAndName(d.inode.fs, child, d, name) if d.children == nil { d.children = make(map[string]*dentry) } else if c, ok := d.children[name]; ok { @@ -77,7 +77,7 @@ func (d *dentry) cacheNegativeLookupLocked(name string) { // this makes remote lookup unavoidable), or if d.isSynthetic() (in which // case the only files in the directory are those for which a dentry exists // in d.children). Instead, just delete any previously-cached dentry. - if d.fs.opts.interop == InteropModeShared || d.isSynthetic() { + if d.inode.fs.opts.interop == InteropModeShared || d.isSynthetic() { delete(d.children, name) return } @@ -124,30 +124,32 @@ type createSyntheticOpts struct { // newSyntheticDentry creates a synthetic file with the given name. func (fs *filesystem) newSyntheticDentry(opts *createSyntheticOpts) *dentry { now := fs.clock.Now().Nanoseconds() + inode := new(inode) + inode.fs = fs + inode.ino = fs.nextIno() + inode.mode.Store(uint32(opts.mode)) + inode.uid.Store(uint32(opts.kuid)) + inode.gid.Store(uint32(opts.kgid)) + inode.blockSize.Store(hostarch.PageSize) + inode.atime.Store(now) + inode.mtime.Store(now) + inode.ctime.Store(now) + inode.btime.Store(now) + inode.nlink.Store(2) + inode.readFD.Store(-1) + inode.writeFD.Store(-1) + inode.mmapFD.Store(-1) child := &dentry{ - refs: atomicbitops.FromInt64(1), // held by parent. 
- fs: fs, - ino: fs.nextIno(), - mode: atomicbitops.FromUint32(uint32(opts.mode)), - uid: atomicbitops.FromUint32(uint32(opts.kuid)), - gid: atomicbitops.FromUint32(uint32(opts.kgid)), - blockSize: atomicbitops.FromUint32(hostarch.PageSize), // arbitrary - atime: atomicbitops.FromInt64(now), - mtime: atomicbitops.FromInt64(now), - ctime: atomicbitops.FromInt64(now), - btime: atomicbitops.FromInt64(now), - readFD: atomicbitops.FromInt32(-1), - writeFD: atomicbitops.FromInt32(-1), - mmapFD: atomicbitops.FromInt32(-1), - nlink: atomicbitops.FromUint32(2), + refs: atomicbitops.FromInt64(1), // held by parent. + inode: inode, } switch opts.mode.FileType() { case linux.S_IFDIR: // Nothing else needs to be done. case linux.S_IFSOCK: - child.endpoint = opts.endpoint + child.inode.endpoint = opts.endpoint case linux.S_IFIFO: - child.pipe = opts.pipe + child.inode.pipe = opts.pipe default: panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) } @@ -224,8 +226,8 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // filesystem.renameMu is needed for d.parent, and must be locked before // d.opMu. 
- d.fs.renameMu.RLock() - defer d.fs.renameMu.RUnlock() + d.inode.fs.renameMu.RLock() + defer d.inode.fs.renameMu.RUnlock() d.opMu.RLock() defer d.opMu.RUnlock() @@ -248,24 +250,24 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { { Name: ".", Type: linux.DT_DIR, - Ino: uint64(d.ino), + Ino: uint64(d.inode.ino), NextOff: 1, }, { Name: "..", - Type: uint8(parent.mode.Load() >> 12), - Ino: uint64(parent.ino), + Type: uint8(parent.inode.mode.Load() >> 12), + Ino: uint64(parent.inode.ino), NextOff: 2, }, } var realChildren map[string]struct{} if !d.isSynthetic() { - if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared { + if d.syntheticChildren != 0 && d.inode.fs.opts.interop == InteropModeShared { // Record the set of children d actually has so that we don't emit // duplicate entries for synthetic children. realChildren = make(map[string]struct{}) } - d.handleMu.RLock() + d.inode.handleMu.RLock() if !d.isReadHandleOk() { // This should not be possible because a readable handle should // have been opened when the calling directoryFD was opened. 
@@ -274,7 +276,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { err := d.getDirentsLocked(ctx, func(name string, key inoKey, dType uint8) { dirent := vfs.Dirent{ Name: name, - Ino: d.fs.inoFromKey(key), + Ino: d.inode.fs.inoFromKey(key), NextOff: int64(len(dirents) + 1), Type: dType, } @@ -283,7 +285,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { realChildren[name] = struct{}{} } }) - d.handleMu.RUnlock() + d.inode.handleMu.RUnlock() if err != nil { return nil, err } @@ -300,8 +302,8 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { } dirents = append(dirents, vfs.Dirent{ Name: child.name, - Type: uint8(child.mode.Load() >> 12), - Ino: uint64(child.ino), + Type: uint8(child.inode.mode.Load() >> 12), + Ino: uint64(child.inode.ino), NextOff: int64(len(dirents) + 1), }) } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 2f5e905607..5499db9515 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -502,7 +502,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } // No cached dentry exists; however, in InteropModeShared there might still be @@ -534,7 +534,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } @@ -602,7 +602,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // Load child if sticky bit is set because we need to determine whether // 
deletion is allowed. var child *dentry - if parent.mode.Load()&linux.ModeSticky == 0 { + if parent.inode.mode.Load()&linux.ModeSticky == 0 { var ok bool parent.childrenMu.Lock() child, ok = parent.children[name] @@ -699,13 +699,13 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // Generate inotify events for rmdir or unlink. if dir { - parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) + parent.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) } else { var cw *vfs.Watches if child != nil { - cw = &child.watches + cw = &child.inode.watches } - vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) + vfs.InotifyRemoveChild(ctx, cw, &parent.inode.watches, name) } parent.childrenMu.Lock() @@ -723,9 +723,10 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if child.isSynthetic() { parent.syntheticChildren-- child.decRefNoCaching() - } else if child.endpoint != nil { + } else if child.inode.endpoint != nil { child.decRefNoCaching() } + child.decLinks() ds = appendDentry(ds, child) } parent.cacheNegativeLookupLocked(name) @@ -806,29 +807,26 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. if d.isDir() { return nil, linuxerr.EPERM } - gid := auth.KGID(d.gid.Load()) - uid := auth.KUID(d.uid.Load()) - mode := linux.FileMode(d.mode.Load()) + gid := auth.KGID(d.inode.gid.Load()) + uid := auth.KUID(d.inode.uid.Load()) + mode := linux.FileMode(d.inode.mode.Load()) if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil { return nil, err } - if d.nlink.Load() == 0 { + if d.inode.nlink.Load() == 0 { return nil, linuxerr.ENOENT } - if d.nlink.Load() == math.MaxUint32 { + if d.inode.nlink.Load() == math.MaxUint32 { return nil, linuxerr.EMLINK } if d.isSynthetic() { // TODO(gvisor.dev/issue/6739): Add synthetic file hard link support. 
return nil, linuxerr.EOPNOTSUPP } + d.incLinks() return parent.link(ctx, d, name) }, nil) - if err == nil { - // Success! - vd.Dentry().Impl().(*dentry).incLinks() - } return err } @@ -840,8 +838,8 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // rather than the caller's and enable setgid. kgid := creds.EffectiveKGID mode := opts.Mode - if parent.mode.Load()&linux.S_ISGID != 0 { - kgid = auth.KGID(parent.gid.Load()) + if parent.inode.mode.Load()&linux.S_ISGID != 0 { + kgid = auth.KGID(parent.inode.gid.Load()) mode |= linux.S_ISGID } @@ -1075,16 +1073,16 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open // also required by d.connect() which is called by // d.openSocketByConnecting(). Note that opening non-synthetic pipes may // block, renameMu is unlocked separately in d.openSpecialFile() for pipes. - d.fs.renameMu.RLock() - defer d.fs.renameMu.RUnlock() + d.inode.fs.renameMu.RLock() + defer d.inode.fs.renameMu.RUnlock() } trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG if trunc { // Lock metadataMu *while* we open a regular file with O_TRUNC because // open(2) will change the file size on server. 
- d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() } var vfd *vfs.FileDescription @@ -1092,7 +1090,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open mnt := rp.Mount() switch d.fileType() { case linux.S_IFREG: - if !d.fs.opts.regularFilesUseSpecialFileFD { + if !d.inode.fs.opts.regularFilesUseSpecialFileFD { if err := d.ensureSharedHandle(ctx, ats.MayRead(), ats.MayWrite(), trunc); err != nil { return nil, err } @@ -1120,11 +1118,11 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open } } fd := &directoryFD{} - fd.LockFD.Init(&d.locks) + fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } - if d.readFD.Load() >= 0 { + if d.inode.readFD.Load() >= 0 { fsmetric.GoferOpensHost.Increment() } else { fsmetric.GoferOpens9P.Increment() @@ -1137,14 +1135,14 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open if d.isSynthetic() { return nil, linuxerr.ENXIO } - if d.fs.iopts.OpenSocketsByConnecting { + if d.inode.fs.iopts.OpenSocketsByConnecting { return d.openSocketByConnecting(ctx, opts) } case linux.S_IFIFO: if d.isSynthetic() { - return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks) + return d.inode.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.inode.locks) } - if d.fs.opts.disableFifoOpen { + if d.inode.fs.opts.disableFifoOpen { logRejectedFifoOpenOnce.Do(func() { log.Warningf("Rejecting attempt to open fifo/pipe from host filesystem: %q. If you want to allow this, set flag --host-fifo=open", d.name) }) @@ -1221,9 +1219,9 @@ retry: // with ENXIO if opening the same named pipe with O_WRONLY would // block because there are no readers of the pipe. Release renameMu // while blocking. 
- d.fs.renameMu.RUnlock() + d.inode.fs.renameMu.RUnlock() err := sleepBetweenNamedPipeOpenChecks(ctx) - d.fs.renameMu.RLock() + d.inode.fs.renameMu.RLock() if err != nil { return nil, err } @@ -1233,9 +1231,9 @@ retry: } if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 { // Release renameMu while blocking. - d.fs.renameMu.RUnlock() + d.inode.fs.renameMu.RUnlock() err := blockUntilNonblockingPipeHasWriter(ctx, h.fd) - d.fs.renameMu.RLock() + d.inode.fs.renameMu.RLock() if err != nil { h.close(ctx) return nil, err @@ -1250,7 +1248,7 @@ retry: } // Preconditions: -// - d.fs.renameMu must be locked. +// - d.inode.fs.renameMu must be locked. // - d.opMu must be locked for writing. // - !d.isSynthetic(). // @@ -1273,8 +1271,8 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // If the parent is a setgid directory, use the parent's GID rather // than the caller's. kgid := creds.EffectiveKGID - if d.mode.Load()&linux.S_ISGID != 0 { - kgid = auth.KGID(d.gid.Load()) + if d.inode.mode.Load()&linux.S_ISGID != 0 { + kgid = auth.KGID(d.inode.gid.Load()) } child, h, err := d.openCreate(ctx, name, opts.Flags&linux.O_ACCMODE, opts.Mode, creds.EffectiveKUID, kgid, true /* createDentry */) @@ -1283,23 +1281,23 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } // Incorporate the fid that was opened by lcreate. 
- useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD + useRegularFileFD := child.fileType() == linux.S_IFREG && !d.inode.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { var readable, writable bool - child.handleMu.Lock() + child.inode.handleMu.Lock() if vfs.MayReadFileWithOpenFlags(opts.Flags) { readable = true if h.fd != -1 { - child.readFD = atomicbitops.FromInt32(h.fd) - child.mmapFD = atomicbitops.FromInt32(h.fd) + child.inode.readFD = atomicbitops.FromInt32(h.fd) + child.inode.mmapFD = atomicbitops.FromInt32(h.fd) } } if vfs.MayWriteFileWithOpenFlags(opts.Flags) { writable = true - child.writeFD = atomicbitops.FromInt32(h.fd) + child.inode.writeFD = atomicbitops.FromInt32(h.fd) } child.updateHandles(ctx, h, readable, writable) - child.handleMu.Unlock() + child.inode.handleMu.Unlock() } // Insert the dentry into the tree. d.childrenMu.Lock() @@ -1329,7 +1327,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } - d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) + d.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) return childVFSFD, nil } @@ -1520,7 +1518,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if replaced.isSynthetic() { newParent.syntheticChildren-- replaced.decRefNoCaching() - } else if replaced.endpoint != nil { + } else if replaced.inode.endpoint != nil { replaced.decRefNoCaching() } ds = appendDentry(ds, replaced) @@ -1560,7 +1558,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa newParent.incLinks() } } - vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir()) + vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParent.inode.watches, &newParent.inode.watches, oldName, newName, renamed.isDir()) return nil } @@ -1647,14 +1645,14 
@@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ if err != nil { return nil, err } - if parent.fs.opts.interop != InteropModeShared { + if parent.inode.fs.opts.interop != InteropModeShared { // Cache the symlink target on creation. In practice, this helps avoid a // lot of ReadLink RPCs. Note that when InteropModeShared is in effect, // we are forced to make Readlink RPCs. Because in this mode, we use host // timestamps, not timestamps based on our internal clock. And readlink // updates the atime on the host. - child.haveTarget = true - child.target = target + child.inode.haveTarget = true + child.inode.target = target } return child, nil }, nil) @@ -1680,8 +1678,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath if !d.isSocket() { return nil, linuxerr.ECONNREFUSED } - if d.endpoint != nil { - return d.endpoint, nil + if d.inode.endpoint != nil { + return d.inode.endpoint, nil } if !d.isSynthetic() { d.IncRef() diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index a6b32ca00c..ffb0f9d356 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -236,8 +236,10 @@ type filesystem struct { // across checkpoint/restore because inode numbers may be reused between // different gofer processes, so inode numbers may be repeated for different // files across checkpoint/restore. inoByKey is protected by inoMu. - inoMu sync.Mutex `state:"nosave"` - inoByKey map[inoKey]uint64 `state:"nosave"` + inoMu sync.Mutex `state:"nosave"` + inoByKey map[inoKey]uint64 `state:"nosave"` + inodeByKeyMu sync.Mutex `state:"nosave"` + inodeByKey map[inoKey]*inode `state:"nosave"` // lastIno is the last inode number assigned to a file. lastIno is accessed // using atomic memory operations. 
@@ -554,12 +556,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, err } fs := &filesystem{ - mf: mf, - opts: fsopts, - iopts: iopts, - clock: ktime.RealtimeClockFromContext(ctx), - devMinor: devMinor, - inoByKey: make(map[inoKey]uint64), + mf: mf, + opts: fsopts, + iopts: iopts, + clock: ktime.RealtimeClockFromContext(ctx), + devMinor: devMinor, + inoByKey: make(map[inoKey]uint64), + inodeByKey: make(map[inoKey]*inode), } // Did the user configure a global dentry cache? @@ -592,6 +595,64 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return &fs.vfsfs, &fs.root.vfsd, nil } +func (fs *filesystem) createOrFindInodeStatx(inoKey inoKey, statx *linux.Statx) (*inode, error) { + fs.inodeByKeyMu.Lock() + defer fs.inodeByKeyMu.Unlock() + inodePtr := fs.inodeByKey[inoKey] + if inodePtr != nil { + inodePtr.nlink.Store(uint32(statx.Nlink)) + return inodePtr, nil + } + inodePtr = new(inode) + inodePtr.fs = fs + inodePtr.inoKey = inoKey + fs.inodeByKey[inoKey] = inodePtr + inodePtr.nlink.Store(uint32(statx.Nlink)) + // Hold metadataMu while initializing the new inode's metadata fields.
+ inodePtr.metadataMu.Lock() + inodePtr.ino = fs.inoFromKey(inoKey) + inodePtr.mode.Store(uint32(statx.Mode)) + inodePtr.uid.Store(uint32(fs.opts.dfltuid)) + inodePtr.gid.Store(uint32(fs.opts.dfltgid)) + inodePtr.blockSize.Store(uint32(hostarch.PageSize)) + inodePtr.metadataMu.Unlock() + inodePtr.readFD.Store(-1) + inodePtr.writeFD.Store(-1) + inodePtr.mmapFD.Store(-1) + return inodePtr, nil +} + +func (fs *filesystem) createOrFindInodeStat(inoKey inoKey, stat *unix.Stat_t) (*inode, error) { + fs.inodeByKeyMu.Lock() + defer fs.inodeByKeyMu.Unlock() + inodePtr := fs.inodeByKey[inoKey] + if inodePtr != nil { + inodePtr.nlink.Store(uint32(stat.Nlink)) + return inodePtr, nil + } + inodePtr = new(inode) + fs.inodeByKey[inoKey] = inodePtr + inodePtr.fs = fs + inodePtr.inoKey = inoKey + inodePtr.size.Store(uint64(stat.Size)) + inodePtr.nlink.Store(uint32(stat.Nlink)) + // Hold metadataMu while initializing the new inode's metadata fields. + inodePtr.metadataMu.Lock() + inodePtr.ino = fs.inoFromKey(inoKey) + inodePtr.mode.Store(uint32(stat.Mode)) + inodePtr.uid.Store(uint32(stat.Uid)) + inodePtr.gid.Store(uint32(stat.Gid)) + inodePtr.blockSize.Store(uint32(stat.Blksize)) + inodePtr.atime.Store(dentryTimestampFromUnix(stat.Atim)) + inodePtr.mtime.Store(dentryTimestampFromUnix(stat.Mtim)) + inodePtr.ctime.Store(dentryTimestampFromUnix(stat.Ctim)) + inodePtr.metadataMu.Unlock() + inodePtr.readFD.Store(-1) + inodePtr.writeFD.Store(-1) + inodePtr.mmapFD.Store(-1) + return inodePtr, nil +} + // initClientAndGetRoot initializes fs.client and returns the root inode for // this mount point. It handles the attach point (fs.opts.aname) resolution.
func (fs *filesystem) initClientAndGetRoot(ctx context.Context) (lisafs.Inode, int, error) { @@ -696,23 +757,23 @@ func (fs *filesystem) Release(ctx context.Context) { fs.syncMu.Lock() for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() { d := elem.d - d.handleMu.Lock() - d.dataMu.Lock() + d.inode.handleMu.Lock() + d.inode.dataMu.Lock() if d.isWriteHandleOk() { // Write dirty cached data to the remote file. h := d.writeHandle() - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirtyAll(ctx, &d.inode.cache, &d.inode.dirty, d.inode.size.Load(), mf, h.writeFromBlocksAt); err != nil { log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) } // TODO(jamieliu): Do we need to flushf/fsync d? } // Discard cached pages. - d.cache.DropAll(mf) - d.dirty.RemoveAll() - d.dataMu.Unlock() + d.inode.cache.DropAll(mf) + d.inode.dirty.RemoveAll() + d.inode.dataMu.Unlock() // Close host FDs if they exist. d.closeHostFDs() - d.handleMu.Unlock() + d.inode.handleMu.Unlock() } // There can't be any specialFileFDs still using fs, since each such // FileDescription would hold a reference on a Mount holding a reference on @@ -750,9 +811,9 @@ func (fs *filesystem) Release(ctx context.Context) { // endpoint != nil. Such dentries have one reference for existence that should // be dropped during filesystem.Release. // -// Precondition: d.fs.renameMu is locked for writing. +// Precondition: d.inode.fs.renameMu is locked for writing. func (d *dentry) releaseExtraRefsRecursiveLocked(ctx context.Context) { - if d.isSynthetic() || d.endpoint != nil { + if d.isSynthetic() || d.inode.endpoint != nil { d.decRefNoCaching() d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) } @@ -796,6 +857,137 @@ func inoKeyFromStat(stat *unix.Stat_t) inoKey { } } +// inode represents a filesystem object. 
+// +// +stateify savable +type inode struct { + // fs is the filesystem that this inode belongs to. + fs *filesystem + + // A reference is held on all inodes as long as they are reachable in the + // filesystem tree, i.e. nlink is nonzero. This reference is dropped when + // nlink reaches 0. + refs atomicbitops.Int64 + + // inode key + inoKey inoKey + + // File size, which differs from other metadata in two ways: + // + // - We make a best-effort attempt to keep it up to date even if + // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. + // + // - size is protected by both metadataMu and dataMu (i.e. both must be + // locked to mutate it; locking either is sufficient to access it). + size atomicbitops.Uint64 + + // inode metadata. Writing multiple fields atomically requires holding + // metadataMu, otherwise atomic operations can be used. + metadataMu sync.Mutex `state:"nosave"` + mode atomicbitops.Uint32 // file type and mode + nlink atomicbitops.Uint32 // accessed using atomic operations; not protected by metadataMu + uid atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid atomicbitops.Uint32 // auth.KGID, but ... + blockSize atomicbitops.Uint32 // 0 if unknown + ino uint64 // immutable + // Timestamps, all nsecs from the Unix epoch. + atime atomicbitops.Int64 + mtime atomicbitops.Int64 + ctime atomicbitops.Int64 + btime atomicbitops.Int64 + + // If this inode does not represent a synthetic file, deleted is 0, and + // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the + // remote file's timestamps, which should be updated when this inode is + // evicted. + atimeDirty atomicbitops.Uint32 + mtimeDirty atomicbitops.Uint32 + + mapsMu sync.Mutex `state:"nosave"` + + // If this inode represents a regular file, mappings tracks mappings of + // the file into memmap.MappingSpaces. mappings is protected by mapsMu. 
+ mappings memmap.MappingSet + + dataMu sync.RWMutex `state:"nosave"` + + // If this inode represents a regular file that is client-cached, cache + // maps offsets into the cached file to offsets into + // filesystem.mfp.MemoryFile() that store the file's data. cache is + // protected by dataMu. + cache fsutil.FileRangeSet + + // If this inode represents a regular file that is client-cached, dirty + // tracks dirty segments in cache. dirty is protected by dataMu. + dirty fsutil.DirtySet + + // If this inode represents a deleted regular file, savedDeletedData is used + // to store file data for save/restore. + savedDeletedData []byte + + locks vfs.FileLocks + + // Inotify watches for this inode. + watches vfs.Watches + + // - If this dentry represents a regular file or directory, readFD (if not + // -1) is a host FD used for reads by all regularFileFDs/directoryFDs + // representing this dentry. + // + // - If this dentry represents a regular file, writeFD (if not -1) is a host + // FD used for writes by all regularFileFDs representing this dentry. + // + // - If this dentry represents a regular file, mmapFD is the host FD used + // for memory mappings. If mmapFD is -1, no such FD is available, and the + // internal page cache implementation is used for memory mappings instead. + // + // These fields are protected by handleMu. readFD, writeFD, and mmapFD are + // additionally written using atomic memory operations, allowing them to be + // read (albeit racily) with atomic.LoadInt32() without locking handleMu. + // + // readFD and writeFD may or may not be the same file descriptor. Once either + // transitions from closed (-1) to open, it may be mutated with handleMu + // locked, but cannot be closed until the dentry is destroyed. + // + // readFD and writeFD may or may not be the same file descriptor. mmapFD is + // always either -1 or equal to readFD; if the file has been opened for + // writing, it is additionally either -1 or equal to writeFD. 
+ handleMu sync.RWMutex `state:"nosave"` + readFD atomicbitops.Int32 `state:"nosave"` + writeFD atomicbitops.Int32 `state:"nosave"` + mmapFD atomicbitops.Int32 `state:"nosave"` + + // pf implements memmap.File for mappings of hostFD. + pf dentryPlatformFile + + // If this dentry represents a symbolic link, InteropModeShared is not in + // effect, and haveTarget is true, target is the symlink target. haveTarget + // and target are protected by dataMu. + haveTarget bool + target string + + // If this dentry represents a socket file, endpoint is the transport + // endpoint bound to this file. + // + // endpoint often originates from vfs.MknodOptions.Endpoint, in which case + // it can't be recovered if the dentry is evicted from the dentry cache. + // Consequently, an extra reference is held on dentries for which endpoint + // is non-nil to prevent eviction. + endpoint transport.BoundEndpoint + + // If this dentry represents a synthetic named pipe, pipe is the pipe + // endpoint bound to this file. + pipe *pipe.VFSPipe + + // impl is the specific inode implementation for non-synthetic dentries. + // impl is immutable. + // + // If impl is nil, this inode represents a synthetic file, i.e. a + // file that does not exist on the host filesystem. As of this writing, the + // only files that can be synthetic are sockets, pipes, and directories. + impl any // immutable +} + // dentry implements vfs.DentryImpl. // // +stateify savable @@ -812,9 +1004,6 @@ type dentry struct { // using atomic memory operations. refs atomicbitops.Int64 - // fs is the owning filesystem. fs is immutable. - fs *filesystem - // parent is this dentry's parent directory. Each dentry holds a reference // on its parent. If this dentry is a filesystem root, parent is nil. // parent is protected by filesystem.renameMu. @@ -825,10 +1014,12 @@ type dentry struct { // filesystem.renameMu. name string - // inoKey is used to identify this dentry's inode. 
- inoKey inoKey + // inode is the inode represented by this dentry. Multiple dentries may + // share a single non-directory inode (with hard links). inode is + // immutable. + inode *inode - // If deleted is non-zero, the file represented by this dentry has been + // If deleted is non-zero, the file has been deleted. // deleted is accessed using atomic memory operations. deleted atomicbitops.Uint32 @@ -898,140 +1089,10 @@ type dentry struct { // +checklocks:childrenMu childrenSet map[string]struct{} `state:"nosave"` - // Cached metadata; protected by metadataMu. - // To access: - // - In situations where consistency is not required (like stat), these - // can be accessed using atomic operations only (without locking). - // - Lock metadataMu and can access without atomic operations. - // To mutate: - // - Lock metadataMu and use atomic operations to update because we might - // have atomic readers that don't hold the lock. - metadataMu sync.Mutex `state:"nosave"` - ino uint64 // immutable - mode atomicbitops.Uint32 // type is immutable, perms are mutable - uid atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic - gid atomicbitops.Uint32 // auth.KGID, but ... - blockSize atomicbitops.Uint32 // 0 if unknown - // Timestamps, all nsecs from the Unix epoch. - atime atomicbitops.Int64 - mtime atomicbitops.Int64 - ctime atomicbitops.Int64 - btime atomicbitops.Int64 - // File size, which differs from other metadata in two ways: - // - // - We make a best-effort attempt to keep it up to date even if - // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. - // - // - size is protected by both metadataMu and dataMu (i.e. both must be - // locked to mutate it; locking either is sufficient to access it). 
- size atomicbitops.Uint64 - // If this dentry does not represent a synthetic file, deleted is 0, and - // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the - // remote file's timestamps, which should be updated when this dentry is - // evicted. - atimeDirty atomicbitops.Uint32 - mtimeDirty atomicbitops.Uint32 - - // nlink counts the number of hard links to this dentry. It's updated and - // accessed using atomic operations. It's not protected by metadataMu like the - // other metadata fields. - nlink atomicbitops.Uint32 - - mapsMu sync.Mutex `state:"nosave"` - - // If this dentry represents a regular file, mappings tracks mappings of - // the file into memmap.MappingSpaces. mappings is protected by mapsMu. - mappings memmap.MappingSet - - // - If this dentry represents a regular file or directory, readFD (if not - // -1) is a host FD used for reads by all regularFileFDs/directoryFDs - // representing this dentry. - // - // - If this dentry represents a regular file, writeFD (if not -1) is a host - // FD used for writes by all regularFileFDs representing this dentry. - // - // - If this dentry represents a regular file, mmapFD is the host FD used - // for memory mappings. If mmapFD is -1, no such FD is available, and the - // internal page cache implementation is used for memory mappings instead. - // - // These fields are protected by handleMu. readFD, writeFD, and mmapFD are - // additionally written using atomic memory operations, allowing them to be - // read (albeit racily) with atomic.LoadInt32() without locking handleMu. - // - // readFD and writeFD may or may not be the same file descriptor. Once either - // transitions from closed (-1) to open, it may be mutated with handleMu - // locked, but cannot be closed until the dentry is destroyed. - // - // readFD and writeFD may or may not be the same file descriptor. 
mmapFD is - // always either -1 or equal to readFD; if the file has been opened for - // writing, it is additionally either -1 or equal to writeFD. - handleMu sync.RWMutex `state:"nosave"` - readFD atomicbitops.Int32 `state:"nosave"` - writeFD atomicbitops.Int32 `state:"nosave"` - mmapFD atomicbitops.Int32 `state:"nosave"` - - dataMu sync.RWMutex `state:"nosave"` - - // If this dentry represents a regular file that is client-cached, cache - // maps offsets into the cached file to offsets into - // filesystem.mfp.MemoryFile() that store the file's data. cache is - // protected by dataMu. - cache fsutil.FileRangeSet - - // If this dentry represents a regular file that is client-cached, dirty - // tracks dirty segments in cache. dirty is protected by dataMu. - dirty fsutil.DirtySet - - // If this dentry represents a deleted regular file, savedDeletedData is used - // to store file data for save/restore. - savedDeletedData []byte - - // pf implements memmap.File for mappings of hostFD. - pf dentryPlatformFile - - // If this dentry represents a symbolic link, InteropModeShared is not in - // effect, and haveTarget is true, target is the symlink target. haveTarget - // and target are protected by dataMu. - haveTarget bool - target string - - // If this dentry represents a socket file, endpoint is the transport - // endpoint bound to this file. - // - // endpoint often originates from vfs.MknodOptions.Endpoint, in which case - // it can't be recovered if the dentry is evicted from the dentry cache. - // Consequently, an extra reference is held on dentries for which endpoint - // is non-nil to prevent eviction. - endpoint transport.BoundEndpoint - - // If this dentry represents a synthetic named pipe, pipe is the pipe - // endpoint bound to this file. - pipe *pipe.VFSPipe - - locks vfs.FileLocks - - // Inotify watches for this dentry. 
- // - // Note that inotify may behave unexpectedly in the presence of hard links, - // because dentries corresponding to the same file have separate inotify - // watches when they should share the same set. This is the case because it is - // impossible for us to know for sure whether two dentries correspond to the - // same underlying file (see the gofer filesystem section fo vfs/inotify.md for - // a more in-depth discussion on this matter). - watches vfs.Watches - // forMountpoint marks directories that were created for mount points during // container startup. This is used during restore, in case these mount points // need to be recreated. forMountpoint bool - - // impl is the specific dentry implementation for non-synthetic dentries. - // impl is immutable. - // - // If impl is nil, this dentry represents a synthetic file, i.e. a - // file that does not exist on the host filesystem. As of this writing, the - // only files that can be synthetic are sockets, pipes, and directories. - impl any } // +stateify savable @@ -1066,7 +1127,7 @@ func (fs *filesystem) nextIno() uint64 { // init must be called before first use of d. func (d *dentry) init(impl any) { - d.pf.dentry = d + d.inode.pf.dentry = d d.cacheEntry.d = d d.syncableListEntry.d = d // Nested impl-inheritance pattern. In memory it looks like: @@ -1075,23 +1136,23 @@ func (d *dentry) init(impl any) { // making each outer dentry implementation hold the inner dentry by value. // Then the outer most dentry is allocated and we initialize fields inward. // Each inner dentry has a pointer to the next level of implementation. 
- d.impl = impl + d.inode.impl = impl d.vfsd.Init(d) refs.Register(d) } func (d *dentry) isSynthetic() bool { - return d.impl == nil + return d.inode.impl == nil } func (d *dentry) cachedMetadataAuthoritative() bool { - return d.fs.opts.interop != InteropModeShared || d.isSynthetic() + return d.inode.fs.opts.interop != InteropModeShared || d.isSynthetic() } // updateMetadataFromStatxLocked is called to update d's metadata after an update // from the remote filesystem. -// Precondition: d.metadataMu must be locked. -// +checklocks:d.metadataMu +// Precondition: d.inode.metadataMu must be locked. +// +checklocks:d.inode.metadataMu func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) { if stat.Mask&linux.STATX_TYPE != 0 { if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want { @@ -1099,33 +1160,33 @@ func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) { } } if stat.Mask&linux.STATX_MODE != 0 { - d.mode.Store(uint32(stat.Mode)) + d.inode.mode.Store(uint32(stat.Mode)) } if stat.Mask&linux.STATX_UID != 0 { - d.uid.Store(dentryUID(lisafs.UID(stat.UID))) + d.inode.uid.Store(dentryUID(lisafs.UID(stat.UID))) } if stat.Mask&linux.STATX_GID != 0 { - d.gid.Store(dentryGID(lisafs.GID(stat.GID))) + d.inode.gid.Store(dentryGID(lisafs.GID(stat.GID))) } if stat.Blksize != 0 { - d.blockSize.Store(stat.Blksize) + d.inode.blockSize.Store(stat.Blksize) } // Don't override newer client-defined timestamps with old server-defined // ones. 
- if stat.Mask&linux.STATX_ATIME != 0 && d.atimeDirty.Load() == 0 { - d.atime.Store(dentryTimestamp(stat.Atime)) + if stat.Mask&linux.STATX_ATIME != 0 && d.inode.atimeDirty.Load() == 0 { + d.inode.atime.Store(dentryTimestamp(stat.Atime)) } - if stat.Mask&linux.STATX_MTIME != 0 && d.mtimeDirty.Load() == 0 { - d.mtime.Store(dentryTimestamp(stat.Mtime)) + if stat.Mask&linux.STATX_MTIME != 0 && d.inode.mtimeDirty.Load() == 0 { + d.inode.mtime.Store(dentryTimestamp(stat.Mtime)) } if stat.Mask&linux.STATX_CTIME != 0 { - d.ctime.Store(dentryTimestamp(stat.Ctime)) + d.inode.ctime.Store(dentryTimestamp(stat.Ctime)) } if stat.Mask&linux.STATX_BTIME != 0 { - d.btime.Store(dentryTimestamp(stat.Btime)) + d.inode.btime.Store(dentryTimestamp(stat.Btime)) } if stat.Mask&linux.STATX_NLINK != 0 { - d.nlink.Store(stat.Nlink) + d.inode.nlink.Store(stat.Nlink) } if stat.Mask&linux.STATX_SIZE != 0 { d.updateSizeLocked(stat.Size) @@ -1134,39 +1195,39 @@ func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) { // updateMetadataFromStatLocked is similar to updateMetadataFromStatxLocked, // except that it takes a unix.Stat_t argument. -// Precondition: d.metadataMu must be locked. -// +checklocks:d.metadataMu +// Precondition: d.inode.metadataMu must be locked. +// +checklocks:d.inode.metadataMu func (d *directfsDentry) updateMetadataFromStatLocked(stat *unix.Stat_t) error { if got, want := stat.Mode&unix.S_IFMT, d.fileType(); got != want { panic(fmt.Sprintf("direct.dentry file type changed from %#o to %#o", want, got)) } - d.mode.Store(stat.Mode) - d.uid.Store(stat.Uid) - d.gid.Store(stat.Gid) - d.blockSize.Store(uint32(stat.Blksize)) + d.inode.mode.Store(stat.Mode) + d.inode.uid.Store(stat.Uid) + d.inode.gid.Store(stat.Gid) + d.inode.blockSize.Store(uint32(stat.Blksize)) // Don't override newer client-defined timestamps with old host-defined // ones. 
- if d.atimeDirty.Load() == 0 { - d.atime.Store(dentryTimestampFromUnix(stat.Atim)) + if d.inode.atimeDirty.Load() == 0 { + d.inode.atime.Store(dentryTimestampFromUnix(stat.Atim)) } - if d.mtimeDirty.Load() == 0 { - d.mtime.Store(dentryTimestampFromUnix(stat.Mtim)) + if d.inode.mtimeDirty.Load() == 0 { + d.inode.mtime.Store(dentryTimestampFromUnix(stat.Mtim)) } - d.ctime.Store(dentryTimestampFromUnix(stat.Ctim)) - d.nlink.Store(uint32(stat.Nlink)) + d.inode.ctime.Store(dentryTimestampFromUnix(stat.Ctim)) + d.inode.nlink.Store(uint32(stat.Nlink)) d.updateSizeLocked(uint64(stat.Size)) return nil } // Preconditions: !d.isSynthetic(). -// Preconditions: d.metadataMu is locked. -// +checklocks:d.metadataMu +// Preconditions: d.inode.metadataMu is locked. +// +checklocks:d.inode.metadataMu func (d *dentry) refreshSizeLocked(ctx context.Context) error { - d.handleMu.RLock() + d.inode.handleMu.RLock() // Can use RacyLoad() because handleMu is locked. - if d.writeFD.RacyLoad() < 0 { - d.handleMu.RUnlock() + if d.inode.writeFD.RacyLoad() < 0 { + d.inode.handleMu.RUnlock() // Use a suitable FD if we don't have a writable host FD. return d.updateMetadataLocked(ctx, noHandle) } @@ -1174,8 +1235,8 @@ func (d *dentry) refreshSizeLocked(ctx context.Context) error { // Using statx(2) with a minimal mask is faster than fstat(2). var stat unix.Statx_t // Can use RacyLoad() because handleMu is locked. - err := unix.Statx(int(d.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat) - d.handleMu.RUnlock() // must be released before updateSizeLocked() + err := unix.Statx(int(d.inode.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat) + d.inode.handleMu.RUnlock() // must be released before updateSizeLocked() if err != nil { return err } @@ -1185,21 +1246,21 @@ func (d *dentry) refreshSizeLocked(ctx context.Context) error { // Preconditions: !d.isSynthetic(). 
func (d *dentry) updateMetadata(ctx context.Context) error { - // d.metadataMu must be locked *before* we stat so that we do not end up + // d.inode.metadataMu must be locked *before* we stat so that we do not end up // updating stale attributes in d.updateMetadataFromStatLocked(). - d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() return d.updateMetadataLocked(ctx, noHandle) } func (d *dentry) fileType() uint32 { - return d.mode.Load() & linux.S_IFMT + return d.inode.mode.Load() & linux.S_IFMT } func (d *dentry) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME - stat.Blksize = d.blockSize.Load() - stat.Nlink = d.nlink.Load() + stat.Blksize = d.inode.blockSize.Load() + stat.Nlink = d.inode.nlink.Load() if stat.Nlink == 0 { // The remote filesystem doesn't support link count; just make // something up. This is consistent with Linux, where @@ -1208,20 +1269,20 @@ func (d *dentry) statTo(stat *linux.Statx) { // it's not provided by the remote filesystem. stat.Nlink = 1 } - stat.UID = d.uid.Load() - stat.GID = d.gid.Load() - stat.Mode = uint16(d.mode.Load()) - stat.Ino = uint64(d.ino) - stat.Size = d.size.Load() + stat.UID = d.inode.uid.Load() + stat.GID = d.inode.gid.Load() + stat.Mode = uint16(d.inode.mode.Load()) + stat.Ino = uint64(d.inode.ino) + stat.Size = d.inode.size.Load() // This is consistent with regularFileFD.Seek(), which treats regular files // as having no holes. 
stat.Blocks = (stat.Size + 511) / 512 - stat.Atime = linux.NsecToStatxTimestamp(d.atime.Load()) - stat.Btime = linux.NsecToStatxTimestamp(d.btime.Load()) - stat.Ctime = linux.NsecToStatxTimestamp(d.ctime.Load()) - stat.Mtime = linux.NsecToStatxTimestamp(d.mtime.Load()) + stat.Atime = linux.NsecToStatxTimestamp(d.inode.atime.Load()) + stat.Btime = linux.NsecToStatxTimestamp(d.inode.btime.Load()) + stat.Ctime = linux.NsecToStatxTimestamp(d.inode.ctime.Load()) + stat.Mtime = linux.NsecToStatxTimestamp(d.inode.mtime.Load()) stat.DevMajor = linux.UNNAMED_MAJOR - stat.DevMinor = d.fs.devMinor + stat.DevMinor = d.inode.fs.devMinor } // Precondition: fs.renameMu is locked. @@ -1233,8 +1294,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return linuxerr.EPERM } - mode := linux.FileMode(d.mode.Load()) - if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil { + mode := linux.FileMode(d.inode.mode.Load()) + if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.inode.uid.Load()), auth.KGID(d.inode.gid.Load())); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { @@ -1266,7 +1327,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs } // Use client clocks for timestamps. - now = d.fs.clock.Now().Nanoseconds() + now = d.inode.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW { stat.Atime = linux.NsecToStatxTimestamp(now) } @@ -1275,20 +1336,20 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs } } - d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() // As with Linux, if the UID, GID, or file size is changing, we have to // clear permission bits. 
Note that when set, clearSGID may cause // permissions to be updated. - clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != d.uid.Load()) || - (stat.Mask&linux.STATX_GID != 0 && stat.GID != d.gid.Load()) || + clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != d.inode.uid.Load()) || + (stat.Mask&linux.STATX_GID != 0 && stat.GID != d.inode.gid.Load()) || stat.Mask&linux.STATX_SIZE != 0 if clearSGID { if stat.Mask&linux.STATX_MODE != 0 { stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode))) } else { - oldMode := d.mode.Load() + oldMode := d.inode.mode.Load() if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode { stat.Mode = uint16(updatedMode) stat.Mask |= linux.STATX_MODE @@ -1308,35 +1369,35 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs if err := d.prepareSetStat(ctx, stat); err != nil { return err } - d.handleMu.RLock() + d.inode.handleMu.RLock() if stat.Mask&linux.STATX_SIZE != 0 { - // d.dataMu must be held around the update to both the remote - // file's size and d.size to serialize with writeback (which - // might otherwise write data back up to the old d.size after + // d.inode.dataMu must be held around the update to both the remote + // file's size and d.inode.size to serialize with writeback (which + // might otherwise write data back up to the old d.inode.size after // the remote file has been truncated). 
- d.dataMu.Lock() + d.inode.dataMu.Lock() } var err error failureMask, failureErr, err = d.setStatLocked(ctx, stat) - d.handleMu.RUnlock() + d.inode.handleMu.RUnlock() if err != nil { if stat.Mask&linux.STATX_SIZE != 0 { - d.dataMu.Unlock() // +checklocksforce: locked conditionally above + d.inode.dataMu.Unlock() // +checklocksforce: locked conditionally above } return err } if stat.Mask&linux.STATX_SIZE != 0 { if failureMask&linux.STATX_SIZE == 0 { - // d.size should be kept up to date, and privatized + // d.inode.size should be kept up to date, and privatized // copy-on-write mappings of truncated pages need to be // invalidated, even if InteropModeShared is in effect. d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above } else { - d.dataMu.Unlock() // +checklocksforce: locked conditionally above + d.inode.dataMu.Unlock() // +checklocksforce: locked conditionally above } } } - if d.fs.opts.interop == InteropModeShared { + if d.inode.fs.opts.interop == InteropModeShared { // There's no point to updating d's metadata in this case since // it'll be overwritten by revalidation before the next time it's // used anyway. 
(InteropModeShared inhibits client caching of @@ -1345,13 +1406,13 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs } } if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 { - d.mode.Store(d.fileType() | uint32(stat.Mode)) + d.inode.mode.Store(d.fileType() | uint32(stat.Mode)) } if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 { - d.uid.Store(stat.UID) + d.inode.uid.Store(stat.UID) } if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 { - d.gid.Store(stat.GID) + d.inode.gid.Store(stat.GID) } // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because // if d.cachedMetadataAuthoritative() then we converted stat.Atime and @@ -1359,14 +1420,14 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs // !d.cachedMetadataAuthoritative() then we returned after calling // d.file.setAttr(). For the same reason, now must have been initialized. if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 { - d.atime.Store(stat.Atime.ToNsec()) - d.atimeDirty.Store(0) + d.inode.atime.Store(stat.Atime.ToNsec()) + d.inode.atimeDirty.Store(0) } if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 { - d.mtime.Store(stat.Mtime.ToNsec()) - d.mtimeDirty.Store(0) + d.inode.mtime.Store(stat.Mtime.ToNsec()) + d.inode.mtimeDirty.Store(0) } - d.ctime.Store(now) + d.inode.ctime.Store(now) if failureMask != 0 { // Setting some attribute failed on the remote filesystem. return failureErr @@ -1374,15 +1435,15 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } -// doAllocate performs an allocate operation on d. Note that d.metadataMu will +// doAllocate performs an allocate operation on d. Note that d.inode.metadataMu will // be held when allocate is called. 
func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { - d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() // Allocating a smaller size is a noop. size := offset + length - if d.cachedMetadataAuthoritative() && size <= d.size.RacyLoad() { + if d.cachedMetadataAuthoritative() && size <= d.inode.size.RacyLoad() { return nil } @@ -1397,51 +1458,51 @@ func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate return nil } -// Preconditions: d.metadataMu must be locked. +// Preconditions: d.inode.metadataMu must be locked. func (d *dentry) updateSizeLocked(newSize uint64) { - d.dataMu.Lock() + d.inode.dataMu.Lock() d.updateSizeAndUnlockDataMuLocked(newSize) } -// Preconditions: d.metadataMu and d.dataMu must be locked. +// Preconditions: d.inode.metadataMu and d.inode.dataMu must be locked. // -// Postconditions: d.dataMu is unlocked. -// +checklocksrelease:d.dataMu +// Postconditions: d.inode.dataMu is unlocked. +// +checklocksrelease:d.inode.dataMu func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) { - oldSize := d.size.RacyLoad() - d.size.Store(newSize) - // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings + oldSize := d.inode.size.RacyLoad() + d.inode.size.Store(newSize) + // d.inode.dataMu must be unlocked to lock d.inode.mapsMu and invalidate mappings // below. This allows concurrent calls to Read/Translate/etc. These // functions synchronize with truncation by refusing to use cache - // contents beyond the new d.size. (We are still holding d.metadataMu, + // contents beyond the new d.inode.size. (We are still holding d.inode.metadataMu, // so we can't race with Write or another truncate.) 
- d.dataMu.Unlock() + d.inode.dataMu.Unlock() if newSize < oldSize { oldpgend, _ := hostarch.PageRoundUp(oldSize) newpgend, _ := hostarch.PageRoundUp(newSize) if oldpgend != newpgend { - d.mapsMu.Lock() - d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + d.inode.mapsMu.Lock() + d.inode.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ // Compare Linux's mm/truncate.c:truncate_setsize() => // truncate_pagecache() => // mm/memory.c:unmap_mapping_range(evencows=1). InvalidatePrivate: true, }) - d.mapsMu.Unlock() + d.inode.mapsMu.Unlock() } // We are now guaranteed that there are no translations of // truncated pages, and can remove them from the cache. Since // truncated pages have been removed from the remote file, they // should be dropped without being written back. - d.dataMu.Lock() - d.cache.Truncate(newSize, d.fs.mf) - d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend}) - d.dataMu.Unlock() + d.inode.dataMu.Lock() + d.inode.cache.Truncate(newSize, d.inode.fs.mf) + d.inode.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend}) + d.inode.dataMu.Unlock() } } func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())) + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.inode.mode.Load()), auth.KUID(d.inode.uid.Load()), auth.KGID(d.inode.gid.Load())) } func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { @@ -1456,13 +1517,9 @@ func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats if strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { return linuxerr.EOPNOTSUPP } - // Do not allow writes to the "security" namespace on the host filesystem. 
- if ats.MayWrite() && strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) { - return linuxerr.EOPNOTSUPP - } - mode := linux.FileMode(d.mode.Load()) - kuid := auth.KUID(d.uid.Load()) - kgid := auth.KGID(d.gid.Load()) + mode := linux.FileMode(d.inode.mode.Load()) + kuid := auth.KUID(d.inode.uid.Load()) + kgid := auth.KGID(d.inode.gid.Load()) if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { return err } @@ -1472,10 +1529,10 @@ func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky( creds, - linux.FileMode(d.mode.Load()), - auth.KUID(d.uid.Load()), - auth.KUID(child.uid.Load()), - auth.KGID(child.gid.Load()), + linux.FileMode(d.inode.mode.Load()), + auth.KUID(d.inode.uid.Load()), + auth.KUID(child.inode.uid.Load()), + auth.KGID(child.inode.gid.Load()), ) } @@ -1495,7 +1552,7 @@ func dentryGID(gid lisafs.GID) uint32 { // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { - // d.refs may be 0 if d.fs.renameMu is locked, which serializes against + // d.refs may be 0 if d.inode.fs.renameMu is locked, which serializes against // d.checkCachingLocked(). r := d.refs.Add(1) if d.LogRefs() { @@ -1564,18 +1621,18 @@ func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, e events |= linux.IN_ISDIR } - d.fs.ancestryMu.RLock() + d.inode.fs.ancestryMu.RLock() // The ordering below is important, Linux always notifies the parent first. if parent := d.parent.Load(); parent != nil { - parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) + parent.inode.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) } - d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) - d.fs.ancestryMu.RUnlock() + d.inode.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) + d.inode.fs.ancestryMu.RUnlock() } // Watches implements vfs.DentryImpl.Watches. 
func (d *dentry) Watches() *vfs.Watches { - return &d.watches + return &d.inode.watches } // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. @@ -1599,7 +1656,7 @@ func (d *dentry) OnZeroWatches(ctx context.Context) { // operation. One of the calls may destroy the dentry, so subsequent calls will // do nothing. // -// Preconditions: d.fs.renameMu must be locked for writing if +// Preconditions: d.inode.fs.renameMu must be locked for writing if // renameMuWriteLocked is true; it may be temporarily unlocked. func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) { d.cachingMu.Lock() @@ -1626,9 +1683,9 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo d.removeFromCacheLocked() d.cachingMu.Unlock() if !renameMuWriteLocked { - // Need to lock d.fs.renameMu for writing as needed by d.destroyLocked(). - d.fs.renameMu.Lock() - defer d.fs.renameMu.Unlock() + // Need to lock d.inode.fs.renameMu for writing as needed by d.destroyLocked(). + d.inode.fs.renameMu.Lock() + defer d.inode.fs.renameMu.Unlock() // Now that renameMu is locked for writing, no more refs can be taken on // d because path resolution requires renameMu for reading at least. if d.refs.Load() != 0 { @@ -1638,7 +1695,7 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo } } if d.isDeleted() { - d.watches.HandleDeletion(ctx) + d.inode.watches.HandleDeletion(ctx) } d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point. return @@ -1656,22 +1713,22 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo // If d still has inotify watches and it is not deleted or invalidated, it // can't be evicted. Otherwise, we will lose its watches, even if a new // dentry is created for the same file in the future. 
Note that the size of - // d.watches cannot concurrently transition from zero to non-zero, because + // d.inode.watches cannot concurrently transition from zero to non-zero, because // adding a watch requires holding a reference on d. - if d.watches.Size() > 0 { + if d.inode.watches.Size() > 0 { // As in the refs > 0 case, removing d is beneficial. d.removeFromCacheLocked() d.cachingMu.Unlock() return } - if d.fs.released.Load() != 0 { + if d.inode.fs.released.Load() != 0 { d.cachingMu.Unlock() if !renameMuWriteLocked { - // Need to lock d.fs.renameMu to access d.parent. Lock it for writing as + // Need to lock d.inode.fs.renameMu to access d.parent. Lock it for writing as // needed by d.destroyLocked() later. - d.fs.renameMu.Lock() - defer d.fs.renameMu.Unlock() + d.inode.fs.renameMu.Lock() + defer d.inode.fs.renameMu.Unlock() } if parent := d.parent.Load(); parent != nil { parent.childrenMu.Lock() @@ -1682,42 +1739,42 @@ func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked boo return } - d.fs.dentryCache.mu.Lock() + d.inode.fs.dentryCache.mu.Lock() // If d is already cached, just move it to the front of the LRU. if d.cached { - d.fs.dentryCache.dentries.Remove(&d.cacheEntry) - d.fs.dentryCache.dentries.PushFront(&d.cacheEntry) - d.fs.dentryCache.mu.Unlock() + d.inode.fs.dentryCache.dentries.Remove(&d.cacheEntry) + d.inode.fs.dentryCache.dentries.PushFront(&d.cacheEntry) + d.inode.fs.dentryCache.mu.Unlock() d.cachingMu.Unlock() return } // Cache the dentry, then evict the least recently used cached dentry if // the cache becomes over-full. 
- d.fs.dentryCache.dentries.PushFront(&d.cacheEntry) - d.fs.dentryCache.dentriesLen++ + d.inode.fs.dentryCache.dentries.PushFront(&d.cacheEntry) + d.inode.fs.dentryCache.dentriesLen++ d.cached = true - shouldEvict := d.fs.dentryCache.dentriesLen > d.fs.dentryCache.maxCachedDentries - d.fs.dentryCache.mu.Unlock() + shouldEvict := d.inode.fs.dentryCache.dentriesLen > d.inode.fs.dentryCache.maxCachedDentries + d.inode.fs.dentryCache.mu.Unlock() d.cachingMu.Unlock() if shouldEvict { if !renameMuWriteLocked { - // Need to lock d.fs.renameMu for writing as needed by + // Need to lock d.inode.fs.renameMu for writing as needed by // d.evictCachedDentryLocked(). - d.fs.renameMu.Lock() - defer d.fs.renameMu.Unlock() + d.inode.fs.renameMu.Lock() + defer d.inode.fs.renameMu.Unlock() } - d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. + d.inode.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. } } // Preconditions: d.cachingMu must be locked. func (d *dentry) removeFromCacheLocked() { if d.cached { - d.fs.dentryCache.mu.Lock() - d.fs.dentryCache.dentries.Remove(&d.cacheEntry) - d.fs.dentryCache.dentriesLen-- - d.fs.dentryCache.mu.Unlock() + d.inode.fs.dentryCache.mu.Lock() + d.inode.fs.dentryCache.dentries.Remove(&d.cacheEntry) + d.inode.fs.dentryCache.dentriesLen-- + d.inode.fs.dentryCache.mu.Unlock() d.cached = false } } @@ -1745,7 +1802,7 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { return } - if victim.d.fs == fs { + if victim.d.inode.fs == fs { victim.d.evictLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs return } @@ -1760,23 +1817,23 @@ func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { } // Preconditions: -// - d.fs.renameMu must not be locked for writing. +// - d.inode.fs.renameMu must not be locked for writing. 
func (d *dentry) evict(ctx context.Context) { - d.fs.renameMu.Lock() - defer d.fs.renameMu.Unlock() + d.inode.fs.renameMu.Lock() + defer d.inode.fs.renameMu.Unlock() d.evictLocked(ctx) } // Preconditions: -// - d.fs.renameMu must be locked for writing; it may be temporarily unlocked. +// - d.inode.fs.renameMu must be locked for writing; it may be temporarily unlocked. // -// +checklocks:d.fs.renameMu +// +checklocks:d.inode.fs.renameMu func (d *dentry) evictLocked(ctx context.Context) { d.cachingMu.Lock() d.removeFromCacheLocked() - // d.refs or d.watches.Size() may have become non-zero from an earlier path + // d.refs or d.inode.watches.Size() may have become non-zero from an earlier path // resolution since it was inserted into fs.dentryCache.dentries. - if d.refs.Load() != 0 || d.watches.Size() != 0 { + if d.refs.Load() != 0 || d.inode.watches.Size() != 0 { d.cachingMu.Unlock() return } @@ -1785,7 +1842,7 @@ func (d *dentry) evictLocked(ctx context.Context) { if !d.vfsd.IsDead() { // Note that d can't be a mount point (in any mount namespace), since VFS // holds references on mount points. - rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd) + rcs := d.inode.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd) for _, rc := range rcs { rc.DecRef(ctx) } @@ -1811,43 +1868,43 @@ func (d *dentry) evictLocked(ctx context.Context) { // destroyDisconnected destroys an uncached, unparented dentry. There are no // locking preconditions. func (d *dentry) destroyDisconnected(ctx context.Context) { - mf := d.fs.mf + mf := d.inode.fs.mf - d.handleMu.Lock() - d.dataMu.Lock() + d.inode.handleMu.Lock() + d.inode.dataMu.Lock() if d.isWriteHandleOk() { // Write dirty pages back to the remote filesystem. 
h := d.writeHandle() - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirtyAll(ctx, &d.inode.cache, &d.inode.dirty, d.inode.size.Load(), mf, h.writeFromBlocksAt); err != nil { log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) } } // Discard cached data. - if !d.cache.IsEmpty() { + if !d.inode.cache.IsEmpty() { mf.MarkAllUnevictable(d) - d.cache.DropAll(mf) - d.dirty.RemoveAll() + d.inode.cache.DropAll(mf) + d.inode.dirty.RemoveAll() } - d.dataMu.Unlock() + d.inode.dataMu.Unlock() // Close any resources held by the implementation. d.destroyImpl(ctx) // Can use RacyLoad() because handleMu is locked. - if d.readFD.RacyLoad() >= 0 { - _ = unix.Close(int(d.readFD.RacyLoad())) + if d.inode.readFD.RacyLoad() >= 0 { + _ = unix.Close(int(d.inode.readFD.RacyLoad())) } - if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() { - _ = unix.Close(int(d.writeFD.RacyLoad())) + if d.inode.writeFD.RacyLoad() >= 0 && d.inode.readFD.RacyLoad() != d.inode.writeFD.RacyLoad() { + _ = unix.Close(int(d.inode.writeFD.RacyLoad())) } - d.readFD = atomicbitops.FromInt32(-1) - d.writeFD = atomicbitops.FromInt32(-1) - d.mmapFD = atomicbitops.FromInt32(-1) - d.handleMu.Unlock() + d.inode.readFD.Store(-1) + d.inode.writeFD.Store(-1) + d.inode.mmapFD.Store(-1) + d.inode.handleMu.Unlock() if !d.isSynthetic() { - // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, + // Note that it's possible that d.inode.atimeDirty or d.inode.mtimeDirty are true, // i.e. client and server timestamps may differ (because e.g. a client // write was serviced by the page cache, and only written back to the // remote file later). Ideally, we'd write client timestamps back to @@ -1857,9 +1914,9 @@ func (d *dentry) destroyDisconnected(ctx context.Context) { // don't do this. // Remove d from the set of syncable dentries. 
- d.fs.syncMu.Lock() - d.fs.syncableDentries.Remove(&d.syncableListEntry) - d.fs.syncMu.Unlock() + d.inode.fs.syncMu.Lock() + d.inode.fs.syncableDentries.Remove(&d.syncableListEntry) + d.inode.fs.syncMu.Unlock() } // Drop references and stop tracking this child. @@ -1870,12 +1927,12 @@ func (d *dentry) destroyDisconnected(ctx context.Context) { // destroyLocked destroys the dentry. // // Preconditions: -// - d.fs.renameMu must be locked for writing; it may be temporarily unlocked. +// - d.inode.fs.renameMu must be locked for writing; it may be temporarily unlocked. // - d.refs == 0. // - d.parent.children[d.name] != d, i.e. d is not reachable by path traversal // from its former parent dentry. // -// +checklocks:d.fs.renameMu +// +checklocks:d.inode.fs.renameMu func (d *dentry) destroyLocked(ctx context.Context) { switch d.refs.Load() { case 0: @@ -1889,15 +1946,15 @@ func (d *dentry) destroyLocked(ctx context.Context) { // Allow the following to proceed without renameMu locked to improve // scalability. - d.fs.renameMu.Unlock() + d.inode.fs.renameMu.Unlock() // No locks need to be held during destoryDisconnected. d.destroyDisconnected(ctx) - d.fs.renameMu.Lock() + d.inode.fs.renameMu.Lock() // Drop the reference held by d on its parent without recursively locking - // d.fs.renameMu. + // d.inode.fs.renameMu. if parent := d.parent.Load(); parent != nil && parent.decRefNoCaching() == 0 { parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */) @@ -1958,19 +2015,19 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). if !trunc { - d.handleMu.RLock() + d.inode.handleMu.RLock() canReuseCurHandle := (!read || d.isReadHandleOk()) && (!write || d.isWriteHandleOk()) - d.handleMu.RUnlock() + d.inode.handleMu.RUnlock() if canReuseCurHandle { // Current handles are sufficient. 
return nil } } - d.handleMu.Lock() + d.inode.handleMu.Lock() needNewHandle := (read && !d.isReadHandleOk()) || (write && !d.isWriteHandleOk()) || trunc if !needNewHandle { - d.handleMu.Unlock() + d.inode.handleMu.Unlock() return nil } @@ -2000,61 +2057,61 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool h, err = d.openHandle(ctx, openReadable, openWritable, trunc) } if err != nil { - d.handleMu.Unlock() + d.inode.handleMu.Unlock() return err } - // Update d.readFD and d.writeFD + // Update d.inode.readFD and d.inode.writeFD if h.fd >= 0 { - if openReadable && openWritable && (d.readFD.RacyLoad() < 0 || d.writeFD.RacyLoad() < 0 || d.readFD.RacyLoad() != d.writeFD.RacyLoad()) { + if openReadable && openWritable && (d.inode.readFD.RacyLoad() < 0 || d.inode.writeFD.RacyLoad() < 0 || d.inode.readFD.RacyLoad() != d.inode.writeFD.RacyLoad()) { // Replace existing FDs with this one. - if d.readFD.RacyLoad() >= 0 { + if d.inode.readFD.RacyLoad() >= 0 { // We already have a readable FD that may be in use by - // concurrent callers of d.pf.FD(). - if d.fs.opts.overlayfsStaleRead { + // concurrent callers of d.inode.pf.FD(). + if d.inode.fs.opts.overlayfsStaleRead { // If overlayfsStaleRead is in effect, then the new FD // may not be coherent with the existing one, so we // have no choice but to switch to mappings of the new // FD in both the application and sentry. 
- if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { - d.handleMu.Unlock() + if err := d.inode.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { + d.inode.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) h.close(ctx) return err } - fdsToClose = append(fdsToClose, d.readFD.RacyLoad()) + fdsToClose = append(fdsToClose, d.inode.readFD.RacyLoad()) invalidateTranslations = true - d.readFD.Store(h.fd) + d.inode.readFD.Store(h.fd) } else { // Otherwise, we want to avoid invalidating existing // memmap.Translations (which is expensive); instead, use // dup3 to make the old file descriptor refer to the new // file description, then close the new file descriptor - // (which is no longer needed). Racing callers of d.pf.FD() + // (which is no longer needed). Racing callers of d.inode.pf.FD() // may use the old or new file description, but this // doesn't matter since they refer to the same file, and // any racing mappings must be read-only. 
- if err := unix.Dup3(int(h.fd), int(d.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil { - oldFD := d.readFD.RacyLoad() - d.handleMu.Unlock() + if err := unix.Dup3(int(h.fd), int(d.inode.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil { + oldFD := d.inode.readFD.RacyLoad() + d.inode.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err) h.close(ctx) return err } fdsToClose = append(fdsToClose, h.fd) - h.fd = d.readFD.RacyLoad() + h.fd = d.inode.readFD.RacyLoad() } } else { - d.readFD.Store(h.fd) + d.inode.readFD.Store(h.fd) } - if d.writeFD.RacyLoad() != h.fd && d.writeFD.RacyLoad() >= 0 { - fdsToClose = append(fdsToClose, d.writeFD.RacyLoad()) + if d.inode.writeFD.RacyLoad() != h.fd && d.inode.writeFD.RacyLoad() >= 0 { + fdsToClose = append(fdsToClose, d.inode.writeFD.RacyLoad()) } - d.writeFD.Store(h.fd) - d.mmapFD.Store(h.fd) - } else if openReadable && d.readFD.RacyLoad() < 0 { + d.inode.writeFD.Store(h.fd) + d.inode.mmapFD.Store(h.fd) + } else if openReadable && d.inode.readFD.RacyLoad() < 0 { readHandleWasOk := d.isReadHandleOk() - d.readFD.Store(h.fd) + d.inode.readFD.Store(h.fd) // If the file has not been opened for writing, the new FD may // be used for read-only memory mappings. If the file was // previously opened for reading (without an FD), then existing @@ -2062,11 +2119,11 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // invalidate those mappings. if !d.isWriteHandleOk() { invalidateTranslations = readHandleWasOk - d.mmapFD.Store(h.fd) + d.inode.mmapFD.Store(h.fd) } - } else if openWritable && d.writeFD.RacyLoad() < 0 { - d.writeFD.Store(h.fd) - if d.readFD.RacyLoad() >= 0 { + } else if openWritable && d.inode.writeFD.RacyLoad() < 0 { + d.inode.writeFD.Store(h.fd) + if d.inode.readFD.RacyLoad() >= 0 { // We have an existing read-only FD, but the file has just // been opened for writing, so we need to start supporting // writable memory mappings. 
However, the new FD is not @@ -2074,32 +2131,32 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // writable memory mappings. Switch to using the internal // page cache. invalidateTranslations = true - d.mmapFD.Store(-1) + d.inode.mmapFD.Store(-1) } } else { // The new FD is not useful. fdsToClose = append(fdsToClose, h.fd) } - } else if openWritable && d.writeFD.RacyLoad() < 0 && d.mmapFD.RacyLoad() >= 0 { + } else if openWritable && d.inode.writeFD.RacyLoad() < 0 && d.inode.mmapFD.RacyLoad() >= 0 { // We have an existing read-only FD, but the file has just been // opened for writing, so we need to start supporting writable // memory mappings. However, we have no writable host FD. Switch to // using the internal page cache. invalidateTranslations = true - d.mmapFD.Store(-1) + d.inode.mmapFD.Store(-1) } d.updateHandles(ctx, h, openReadable, openWritable) - d.handleMu.Unlock() + d.inode.handleMu.Unlock() if invalidateTranslations { // Invalidate application mappings that may be using an old FD; they // will be replaced with mappings using the new FD after future calls - // to d.Translate(). This requires holding d.mapsMu, which precedes - // d.handleMu in the lock order. - d.mapsMu.Lock() - d.mappings.InvalidateAll(memmap.InvalidateOpts{}) - d.mapsMu.Unlock() + // to d.Translate(). This requires holding d.inode.mapsMu, which precedes + // d.inode.handleMu in the lock order. + d.inode.mapsMu.Lock() + d.inode.mappings.InvalidateAll(memmap.InvalidateOpts{}) + d.inode.mapsMu.Unlock() } for _, fd := range fdsToClose { unix.Close(int(fd)) @@ -2109,12 +2166,12 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool } func (d *dentry) syncRemoteFile(ctx context.Context) error { - d.handleMu.RLock() - defer d.handleMu.RUnlock() + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() return d.syncRemoteFileLocked(ctx) } -// Preconditions: d.handleMu must be locked. 
+// Preconditions: d.inode.handleMu must be locked. func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { // Prefer syncing write handles over read handles, since some remote // filesystem implementations may not sync changes made through write @@ -2127,14 +2184,14 @@ func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { } func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error { - d.handleMu.RLock() - defer d.handleMu.RUnlock() + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() if d.isWriteHandleOk() { // Write back dirty pages to the remote file. - d.dataMu.Lock() + d.inode.dataMu.Lock() h := d.writeHandle() - err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), d.fs.mf, h.writeFromBlocksAt) - d.dataMu.Unlock() + err := fsutil.SyncDirtyAll(ctx, &d.inode.cache, &d.inode.dirty, d.inode.size.Load(), d.inode.fs.mf, h.writeFromBlocksAt) + d.inode.dataMu.Unlock() if err != nil { return err } @@ -2155,20 +2212,20 @@ func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) err // incLinks increments link count. func (d *dentry) incLinks() { - if d.nlink.Load() == 0 { + if d.inode.nlink.Load() == 0 { // The remote filesystem doesn't support link count. return } - d.nlink.Add(1) + d.inode.nlink.Add(1) } // decLinks decrements link count. func (d *dentry) decLinks() { - if d.nlink.Load() == 0 { + if d.inode.nlink.Load() == 0 { // The remote filesystem doesn't support link count. 
return } - d.nlink.Add(^uint32(0)) + d.inode.nlink.Add(^uint32(0)) } // fileDescription is embedded by gofer implementations of diff --git a/pkg/sentry/fsimpl/gofer/lisafs_dentry.go b/pkg/sentry/fsimpl/gofer/lisafs_dentry.go index d7c5699453..40ccb19210 100644 --- a/pkg/sentry/fsimpl/gofer/lisafs_dentry.go +++ b/pkg/sentry/fsimpl/gofer/lisafs_dentry.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" - "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/lisafs" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -111,59 +110,55 @@ func (fs *filesystem) newLisafsDentry(ctx context.Context, ino *lisafs.Inode) (* } inoKey := inoKeyFromStatx(&ino.Stat) + inode, err := fs.createOrFindInodeStatx(inoKey, &ino.Stat) + if inode == nil { + ctx.Warningf("could not create or find inode") + return nil, err + } d := &lisafsDentry{ dentry: dentry{ - fs: fs, - inoKey: inoKey, - ino: fs.inoFromKey(inoKey), - mode: atomicbitops.FromUint32(uint32(ino.Stat.Mode)), - uid: atomicbitops.FromUint32(uint32(fs.opts.dfltuid)), - gid: atomicbitops.FromUint32(uint32(fs.opts.dfltgid)), - blockSize: atomicbitops.FromUint32(hostarch.PageSize), - readFD: atomicbitops.FromInt32(-1), - writeFD: atomicbitops.FromInt32(-1), - mmapFD: atomicbitops.FromInt32(-1), + inode: inode, }, controlFD: fs.client.NewFD(ino.ControlFD), } if ino.Stat.Mask&linux.STATX_UID != 0 { - d.uid = atomicbitops.FromUint32(dentryUID(lisafs.UID(ino.Stat.UID))) + d.inode.uid = atomicbitops.FromUint32(dentryUID(lisafs.UID(ino.Stat.UID))) } if ino.Stat.Mask&linux.STATX_GID != 0 { - d.gid = atomicbitops.FromUint32(dentryGID(lisafs.GID(ino.Stat.GID))) + d.inode.gid = atomicbitops.FromUint32(dentryGID(lisafs.GID(ino.Stat.GID))) } if ino.Stat.Mask&linux.STATX_SIZE != 0 { - d.size = atomicbitops.FromUint64(ino.Stat.Size) + d.inode.size = atomicbitops.FromUint64(ino.Stat.Size) } if ino.Stat.Blksize != 0 { - d.blockSize = 
atomicbitops.FromUint32(ino.Stat.Blksize) + d.inode.blockSize = atomicbitops.FromUint32(ino.Stat.Blksize) } if ino.Stat.Mask&linux.STATX_ATIME != 0 { - d.atime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Atime)) + d.inode.atime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Atime)) } else { - d.atime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds()) + d.inode.atime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds()) } if ino.Stat.Mask&linux.STATX_MTIME != 0 { - d.mtime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Mtime)) + d.inode.mtime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Mtime)) } else { - d.mtime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds()) + d.inode.mtime = atomicbitops.FromInt64(fs.clock.Now().Nanoseconds()) } if ino.Stat.Mask&linux.STATX_CTIME != 0 { - d.ctime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Ctime)) + d.inode.ctime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Ctime)) } else { // Approximate ctime with mtime if ctime isn't available. - d.ctime = atomicbitops.FromInt64(d.mtime.Load()) + d.inode.ctime = atomicbitops.FromInt64(d.inode.mtime.Load()) } if ino.Stat.Mask&linux.STATX_BTIME != 0 { - d.btime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Btime)) + d.inode.btime = atomicbitops.FromInt64(dentryTimestamp(ino.Stat.Btime)) } if ino.Stat.Mask&linux.STATX_NLINK != 0 { - d.nlink = atomicbitops.FromUint32(ino.Stat.Nlink) + d.inode.nlink = atomicbitops.FromUint32(ino.Stat.Nlink) } else { if ino.Stat.Mode&linux.FileTypeMask == linux.ModeDirectory { - d.nlink = atomicbitops.FromUint32(2) + d.inode.nlink = atomicbitops.FromUint32(2) } else { - d.nlink = atomicbitops.FromUint32(1) + d.inode.nlink = atomicbitops.FromUint32(1) } } d.dentry.init(d) @@ -198,18 +193,18 @@ func (d *lisafsDentry) updateHandles(ctx context.Context, h handle, readable, wr d.writeFDLisa = h.fdLisa } // NOTE(b/141991141): Close old FDs before making new fids visible (by - // unlocking d.handleMu). 
+ // unlocking d.inode.handleMu). if oldReadFD.Ok() { - d.fs.client.CloseFD(ctx, oldReadFD, false /* flush */) + d.inode.fs.client.CloseFD(ctx, oldReadFD, false /* flush */) } if oldWriteFD.Ok() && oldReadFD != oldWriteFD { - d.fs.client.CloseFD(ctx, oldWriteFD, false /* flush */) + d.inode.fs.client.CloseFD(ctx, oldWriteFD, false /* flush */) } } -// Precondition: d.metadataMu must be locked. +// Precondition: d.inode.metadataMu must be locked. // -// +checklocks:d.metadataMu +// +checklocks:d.inode.metadataMu func (d *lisafsDentry) updateMetadataLocked(ctx context.Context, h handle) error { handleMuRLocked := false if !h.fdLisa.Ok() { @@ -218,7 +213,7 @@ func (d *lisafsDentry) updateMetadataLocked(ctx context.Context, h handle) error // readable one since some filesystem implementations may update a writable // FD's metadata after writes, without making metadata updates immediately // visible to read-only FDs representing the same file. - d.handleMu.RLock() + d.inode.handleMu.RLock() switch { case d.writeFDLisa.Ok(): h.fdLisa = d.writeFDLisa @@ -228,7 +223,7 @@ func (d *lisafsDentry) updateMetadataLocked(ctx context.Context, h handle) error handleMuRLocked = true default: h.fdLisa = d.controlFD - d.handleMu.RUnlock() + d.inode.handleMu.RUnlock() } } @@ -236,7 +231,7 @@ func (d *lisafsDentry) updateMetadataLocked(ctx context.Context, h handle) error err := h.fdLisa.StatTo(ctx, &stat) if handleMuRLocked { // handleMu must be released before updateMetadataFromStatLocked(). - d.handleMu.RUnlock() // +checklocksforce: complex case. + d.inode.handleMu.RUnlock() // +checklocksforce: complex case. 
} if err != nil { return err @@ -279,7 +274,7 @@ func (d *lisafsDentry) getRemoteChild(ctx context.Context, name string) (*dentry if err != nil { return nil, err } - return d.fs.newLisafsDentry(ctx, &childInode) + return d.inode.fs.newLisafsDentry(ctx, &childInode) } // Preconditions: @@ -336,7 +331,7 @@ func (d *lisafsDentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp r var dentryCreationErr error for i := range inodes { if dentryCreationErr != nil { - d.fs.client.CloseFD(ctx, inodes[i].ControlFD, false /* flush */) + d.inode.fs.client.CloseFD(ctx, inodes[i].ControlFD, false /* flush */) continue } @@ -346,11 +341,11 @@ func (d *lisafsDentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp r if ok && child != nil { // We raced. Clean up the new inode and proceed with // the cached child. - d.fs.client.CloseFD(ctx, inodes[i].ControlFD, false /* flush */) + d.inode.fs.client.CloseFD(ctx, inodes[i].ControlFD, false /* flush */) } else { // Create and cache the new dentry. 
var err error - child, err = d.fs.newLisafsDentry(ctx, &inodes[i]) + child, err = d.inode.fs.newLisafsDentry(ctx, &inodes[i]) if err != nil { dentryCreationErr = err curParentUnlock() @@ -378,7 +373,7 @@ func (d *lisafsDentry) getRemoteChildAndWalkPathLocked(ctx context.Context, rp r } func (d *lisafsDentry) newChildDentry(ctx context.Context, childIno *lisafs.Inode, childName string) (*dentry, error) { - child, err := d.fs.newLisafsDentry(ctx, childIno) + child, err := d.inode.fs.newLisafsDentry(ctx, childIno) if err != nil { if err := d.controlFD.UnlinkAt(ctx, childName, 0 /* flags */); err != nil { log.Warningf("failed to clean up created child %s after newLisafsDentry() failed: %v", childName, err) @@ -407,7 +402,7 @@ func (d *lisafsDentry) mknod(ctx context.Context, name string, creds *auth.Crede if err := d.controlFD.UnlinkAt(ctx, name, 0 /* flags */); err != nil { log.Warningf("failed to clean up socket which was created by BindAt RPC: %v", err) } - d.fs.client.CloseFD(ctx, childInode.ControlFD, false /* flush */) + d.inode.fs.client.CloseFD(ctx, childInode.ControlFD, false /* flush */) return nil, err } child, err := d.newChildDentry(ctx, &childInode, name) @@ -417,7 +412,7 @@ func (d *lisafsDentry) mknod(ctx context.Context, name string, creds *auth.Crede } // Set the endpoint on the newly created child dentry, and take the // corresponding extra dentry reference. 
- child.endpoint = opts.Endpoint + child.inode.endpoint = opts.Endpoint child.IncRef() return child, nil } @@ -458,13 +453,13 @@ func (d *lisafsDentry) openCreate(ctx context.Context, name string, flags uint32 } h := handle{ - fdLisa: d.fs.client.NewFD(openFD), + fdLisa: d.inode.fs.client.NewFD(openFD), fd: int32(hostFD), } if !createDentry { return nil, h, nil } - child, err := d.fs.newLisafsDentry(ctx, &ino) + child, err := d.inode.fs.newLisafsDentry(ctx, &ino) if err != nil { h.close(ctx) return nil, noHandle, err @@ -535,7 +530,7 @@ func (d *lisafsDentry) statfs(ctx context.Context) (linux.Statfs, error) { } func (d *lisafsDentry) restoreFile(ctx context.Context, inode *lisafs.Inode, opts *vfs.CompleteRestoreOptions) error { - d.controlFD = d.fs.client.NewFD(inode.ControlFD) + d.controlFD = d.inode.fs.client.NewFD(inode.ControlFD) // Gofers do not preserve inoKey across checkpoint/restore, so: // @@ -544,29 +539,29 @@ func (d *lisafsDentry) restoreFile(ctx context.Context, inode *lisafs.Inode, opt // checking inoKey. // // - We need to associate the new inoKey with the existing d.ino. - d.inoKey = inoKeyFromStatx(&inode.Stat) - d.fs.inoMu.Lock() - d.fs.inoByKey[d.inoKey] = d.ino - d.fs.inoMu.Unlock() + d.inode.inoKey = inoKeyFromStatx(&inode.Stat) + d.inode.fs.inoMu.Lock() + d.inode.fs.inoByKey[d.inode.inoKey] = d.inode.ino + d.inode.fs.inoMu.Unlock() // Check metadata stability before updating metadata. 
- d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() if d.isRegularFile() { if opts.ValidateFileSizes { if inode.Stat.Mask&linux.STATX_SIZE == 0 { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(d.fs, &d.dentry))} + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(d.inode.fs, &d.dentry))} } - if d.size.RacyLoad() != inode.Stat.Size { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d.fs, &d.dentry), d.size.Load(), inode.Stat.Size)} + if d.inode.size.RacyLoad() != inode.Stat.Size { + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d.inode.fs, &d.dentry), d.inode.size.Load(), inode.Stat.Size)} } } if opts.ValidateFileModificationTimestamps { if inode.Stat.Mask&linux.STATX_MTIME == 0 { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(d.fs, &d.dentry))} + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(d.inode.fs, &d.dentry))} } - if want := dentryTimestamp(inode.Stat.Mtime); d.mtime.RacyLoad() != want { - return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d.fs, &d.dentry), linux.NsecToStatxTimestamp(d.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))} + if want := dentryTimestamp(inode.Stat.Mtime); d.inode.mtime.RacyLoad() != want { + return vfs.ErrCorruption{Err: fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed 
from %+v to %+v", genericDebugPathname(d.inode.fs, &d.dentry), linux.NsecToStatxTimestamp(d.inode.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))} } } } @@ -574,9 +569,9 @@ func (d *lisafsDentry) restoreFile(ctx context.Context, inode *lisafs.Inode, opt d.updateMetadataFromStatxLocked(&inode.Stat) } - if rw, ok := d.fs.savedDentryRW[&d.dentry]; ok { + if rw, ok := d.inode.fs.savedDentryRW[&d.dentry]; ok { if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil { - return fmt.Errorf("failed to restore file handles (read=%t, write=%t) for %q: %w", rw.read, rw.write, genericDebugPathname(d.fs, &d.dentry), err) + return fmt.Errorf("failed to restore file handles (read=%t, write=%t) for %q: %w", rw.read, rw.write, genericDebugPathname(d.inode.fs, &d.dentry), err) } } @@ -590,7 +585,7 @@ func (d *lisafsDentry) restoreFile(ctx context.Context, inode *lisafs.Inode, opt // - fs.renameMu must be locked. // - InteropModeShared is in effect. func doRevalidationLisafs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, state *revalidateState, ds **[]*dentry) error { - start := state.start.impl.(*lisafsDentry) + start := state.start.inode.impl.(*lisafsDentry) // Populate state.names. state.names = state.names[:0] // For sanity. @@ -603,11 +598,11 @@ func doRevalidationLisafs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, st // Lock metadata on all dentries *before* getting attributes for them. if state.refreshStart { - start.metadataMu.Lock() - defer start.metadataMu.Unlock() + start.inode.metadataMu.Lock() + defer start.inode.metadataMu.Unlock() } for _, d := range state.dentries { - d.metadataMu.Lock() + d.inode.metadataMu.Lock() } // lastUnlockedDentry keeps track of the dentries in state.dentries that have // already had their metadataMu unlocked. 
Avoid defer unlock in the loop @@ -617,7 +612,7 @@ func doRevalidationLisafs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, st // Advance to the first unevaluated dentry and unlock the remaining // dentries. for lastUnlockedDentry++; lastUnlockedDentry < len(state.dentries); lastUnlockedDentry++ { - state.dentries[lastUnlockedDentry].metadataMu.Unlock() + state.dentries[lastUnlockedDentry].inode.metadataMu.Unlock() } }() @@ -640,12 +635,12 @@ func doRevalidationLisafs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, st d := state.dentries[i] found := i < len(stats) // Advance lastUnlockedDentry. It is the responsibility of this for loop - // block to unlock d.metadataMu. + // block to unlock d.inode.metadataMu. lastUnlockedDentry = i // Note that synthetic dentries will always fail this comparison check. - if !found || d.inoKey != inoKeyFromStatx(&stats[i]) { - d.metadataMu.Unlock() + if !found || d.inode.inoKey != inoKeyFromStatx(&stats[i]) { + d.inode.metadataMu.Unlock() if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace // it. @@ -658,8 +653,8 @@ func doRevalidationLisafs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, st } // The file at this path hasn't changed. Just update cached metadata. - d.impl.(*lisafsDentry).updateMetadataFromStatxLocked(&stats[i]) // +checklocksforce: see above. - d.metadataMu.Unlock() + d.inode.impl.(*lisafsDentry).updateMetadataFromStatxLocked(&stats[i]) // +checklocksforce: see above. 
+ d.inode.metadataMu.Unlock() } return nil } diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 56d5136817..3aa3f6970e 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -51,16 +51,16 @@ type regularFileFD struct { func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) { fd := &regularFileFD{} - fd.LockFD.Init(&d.locks) + fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { return nil, err } - if fd.vfsfd.IsWritable() && (d.mode.Load()&0111 != 0) { + if fd.vfsfd.IsWritable() && (d.inode.mode.Load()&0111 != 0) { metric.SuspiciousOperationsMetric.Increment(&metric.SuspiciousOperationsTypeOpenedWriteExecuteFile) } - if d.mmapFD.Load() >= 0 { + if d.inode.mmapFD.Load() >= 0 { fsmetric.GoferOpensHost.Increment() } else { fsmetric.GoferOpens9P.Increment() @@ -78,7 +78,7 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { return nil } d := fd.dentry() - if d.fs.opts.interop == InteropModeExclusive { + if d.inode.fs.opts.interop == InteropModeExclusive { // d may have dirty pages that we won't write back now (and wouldn't // have in VFS1), making a flushf RPC ineffective. If this is the case, // skip the flushf. @@ -87,9 +87,9 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { // modes if forcePageCache is in effect; we conservatively assume that // applications have some way of tolerating this and still want the // flushf.
- d.dataMu.RLock() - haveDirtyPages := !d.dirty.IsEmpty() - d.dataMu.RUnlock() + d.inode.dataMu.RLock() + haveDirtyPages := !d.inode.dirty.IsEmpty() + d.inode.dataMu.RUnlock() if haveDirtyPages { return nil } @@ -110,7 +110,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs start := fsmetric.StartReadWait() d := fd.dentry() defer func() { - if d.readFD.Load() >= 0 { + if d.inode.readFD.Load() >= 0 { fsmetric.GoferReadsHost.Increment() fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start) } else { @@ -131,8 +131,8 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs } // Check for reading at EOF before calling into MM (but not under - // InteropModeShared, which makes d.size unreliable). - if d.cachedMetadataAuthoritative() && uint64(offset) >= d.size.Load() { + // InteropModeShared, which makes d.inode.size unreliable). + if d.cachedMetadataAuthoritative() && uint64(offset) >= d.inode.size.Load() { return 0, io.EOF } @@ -151,7 +151,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs rw.direct = true n, readErr = dst.CopyOutFrom(ctx, rw) putDentryReadWriter(rw) - if d.fs.opts.interop != InteropModeShared { + if d.inode.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). d.touchAtimeLocked(fd.vfsfd.Mount()) } @@ -159,7 +159,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs rw := getDentryReadWriter(ctx, d, offset) n, readErr = dst.CopyOutFrom(ctx, rw) putDentryReadWriter(rw) - if d.fs.opts.interop != InteropModeShared { + if d.inode.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). 
d.touchAtime(fd.vfsfd.Mount()) } @@ -198,8 +198,8 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() // If the fd was opened with O_APPEND, make sure the file size is updated. // There is a possible race here if size is modified externally after @@ -212,8 +212,8 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off // Set offset to file size if the fd was opened with O_APPEND. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { - // Holding d.metadataMu is sufficient for reading d.size. - offset = int64(d.size.RacyLoad()) + // Holding d.inode.metadataMu is sufficient for reading d.inode.size. + offset = int64(d.inode.size.RacyLoad()) } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { @@ -221,7 +221,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } src = src.TakeFirst64(limit) - if d.fs.opts.interop != InteropModeShared { + if d.inode.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). @@ -264,14 +264,14 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off // As with Linux, writing clears the setuid and setgid bits. if n > 0 { - oldMode := d.mode.Load() - // If setuid or setgid were set, update d.mode and propagate + oldMode := d.inode.mode.Load() + // If setuid or setgid were set, update d.inode.mode and propagate // changes to the host.
if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode { if err := d.chmod(ctx, uint16(newMode)); err != nil { return 0, offset, err } - d.mode.Store(newMode) + d.inode.mode.Store(newMode) } } @@ -294,19 +294,19 @@ func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64 mr := memmap.MappableRange{pgstart, pgend} var freed []memmap.FileRange - d.dataMu.Lock() - d.cache.RemoveRangeWith(mr, func(cseg fsutil.FileRangeIterator) { + d.inode.dataMu.Lock() + d.inode.cache.RemoveRangeWith(mr, func(cseg fsutil.FileRangeIterator) { freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) }) - d.dataMu.Unlock() + d.inode.dataMu.Unlock() // Invalidate mappings of removed pages. - d.mapsMu.Lock() - d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) - d.mapsMu.Unlock() + d.inode.mapsMu.Lock() + d.inode.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.inode.mapsMu.Unlock() // Finally free pages removed from the cache. - mf := d.fs.mf + mf := d.inode.fs.mf for _, freedFR := range freed { mf.DecRef(freedFR) } @@ -361,10 +361,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) // (which prevents us from caching file contents and makes dentry.size // unreliable), or if the file was opened O_DIRECT, read directly from // readHandle() without locking dentry.dataMu. 
- rw.d.handleMu.RLock() - defer rw.d.handleMu.RUnlock() + rw.d.inode.handleMu.RLock() + defer rw.d.inode.handleMu.RUnlock() h := rw.d.readHandle() - if (rw.d.mmapFD.RacyLoad() >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + if (rw.d.inode.mmapFD.RacyLoad() >= 0 && !rw.d.inode.fs.opts.forcePageCache) || rw.d.inode.fs.opts.interop == InteropModeShared || rw.direct { n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off) rw.off += n return n, err @@ -372,20 +372,20 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) // Otherwise read from/through the cache. memCgID := pgalloc.MemoryCgroupIDFromContext(rw.ctx) - mf := rw.d.fs.mf + mf := rw.d.inode.fs.mf fillCache := mf.ShouldCacheEvictable() var dataMuUnlock func() if fillCache { - rw.d.dataMu.Lock() - dataMuUnlock = rw.d.dataMu.Unlock + rw.d.inode.dataMu.Lock() + dataMuUnlock = rw.d.inode.dataMu.Unlock } else { - rw.d.dataMu.RLock() - dataMuUnlock = rw.d.dataMu.RUnlock + rw.d.inode.dataMu.RLock() + dataMuUnlock = rw.d.inode.dataMu.RUnlock } defer dataMuUnlock() // Compute the range to read (limited by file size and overflow-checked). 
- end := rw.d.size.Load() + end := rw.d.inode.size.Load() if rw.off >= end { return 0, io.EOF } @@ -394,7 +394,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) } var done uint64 - seg, gap := rw.d.cache.Find(rw.off) + seg, gap := rw.d.inode.cache.Find(rw.off) for rw.off < end { mr := memmap.MappableRange{rw.off, end} switch { @@ -428,13 +428,13 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) End: gapEnd, } optMR := gap.Range() - _, err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size.Load(), mf, pgalloc.AllocOpts{ + _, err := rw.d.inode.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.inode.size.Load(), mf, pgalloc.AllocOpts{ Kind: usage.PageCache, MemCgID: memCgID, Mode: pgalloc.AllocateAndWritePopulate, }, h.readToBlocksAt) mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) - seg, gap = rw.d.cache.Find(rw.off) + seg, gap = rw.d.inode.cache.Find(rw.off) if !seg.Ok() { return done, err } @@ -475,16 +475,16 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro // (which prevents us from caching file contents), or if the file was // opened with O_DIRECT, write directly to dentry.writeHandle() // without locking dentry.dataMu. 
- rw.d.handleMu.RLock() - defer rw.d.handleMu.RUnlock() + rw.d.inode.handleMu.RLock() + defer rw.d.inode.handleMu.RUnlock() h := rw.d.writeHandle() - if (rw.d.mmapFD.RacyLoad() >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + if (rw.d.inode.mmapFD.RacyLoad() >= 0 && !rw.d.inode.fs.opts.forcePageCache) || rw.d.inode.fs.opts.interop == InteropModeShared || rw.direct { n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off) rw.off += n - rw.d.dataMu.Lock() - defer rw.d.dataMu.Unlock() - if rw.off > rw.d.size.Load() { - rw.d.size.Store(rw.off) + rw.d.inode.dataMu.Lock() + defer rw.d.inode.dataMu.Unlock() + if rw.off > rw.d.inode.size.Load() { + rw.d.inode.size.Store(rw.off) // The remote file's size will implicitly be extended to the correct // value when we write back to it. } @@ -492,9 +492,9 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro } // Otherwise write to/through the cache. - mf := rw.d.fs.mf - rw.d.dataMu.Lock() - defer rw.d.dataMu.Unlock() + mf := rw.d.inode.fs.mf + rw.d.inode.dataMu.Lock() + defer rw.d.inode.dataMu.Unlock() // Compute the range to write (overflow-checked). 
start := rw.off @@ -507,7 +507,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro done uint64 retErr error ) - seg, gap := rw.d.cache.Find(rw.off) + seg, gap := rw.d.inode.cache.Find(rw.off) for rw.off < end { mr := memmap.MappableRange{rw.off, end} switch { @@ -525,7 +525,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro done += n rw.off += n srcs = srcs.DropFirst64(n) - rw.d.dirty.MarkDirty(segMR) + rw.d.inode.dirty.MarkDirty(segMR) if err != nil { retErr = err goto exitLoop @@ -556,18 +556,18 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro } } exitLoop: - if rw.off > rw.d.size.Load() { - rw.d.size.Store(rw.off) + if rw.off > rw.d.inode.size.Load() { + rw.d.inode.size.Store(rw.off) // The remote file's size will implicitly be extended to the correct // value when we write back to it. } // If InteropModeWritethrough is in effect, flush written data back to the // remote filesystem. - if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 { + if rw.d.inode.fs.opts.interop == InteropModeWritethrough && done != 0 { if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ Start: start, End: rw.off, - }, &rw.d.cache, &rw.d.dirty, rw.d.size.Load(), mf, h.writeFromBlocksAt); err != nil { + }, &rw.d.inode.cache, &rw.d.inode.dirty, rw.d.inode.size.Load(), mf, h.writeFromBlocksAt); err != nil { // We have no idea how many bytes were actually flushed. rw.off = start done = 0 @@ -581,13 +581,13 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { if size == 0 { return nil } - d.handleMu.RLock() - defer d.handleMu.RUnlock() + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() h := d.writeHandle() - d.dataMu.Lock() - defer d.dataMu.Unlock() + d.inode.dataMu.Lock() + defer d.inode.dataMu.Unlock() // Compute the range of valid bytes (overflow-checked). 
- dentrySize := d.size.Load() + dentrySize := d.inode.size.Load() if uint64(offset) >= dentrySize { return nil } @@ -598,7 +598,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { return fsutil.SyncDirty(ctx, memmap.MappableRange{ Start: uint64(offset), End: uint64(end), - }, &d.cache, &d.dirty, dentrySize, d.fs.mf, h.writeFromBlocksAt) + }, &d.inode.cache, &d.inode.dirty, dentrySize, d.inode.fs.mf, h.writeFromBlocksAt) } // Seek implements vfs.FileDescriptionImpl.Seek. @@ -627,7 +627,7 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6 return 0, err } } - size := int64(d.size.Load()) + size := int64(d.inode.size.Load()) // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous // block of data. switch whence { @@ -662,8 +662,8 @@ func (fd *regularFileFD) Sync(ctx context.Context) error { func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { d := fd.dentry() // Force sentry page caching at your own risk. - if !d.fs.opts.forcePageCache { - switch d.fs.opts.interop { + if !d.inode.fs.opts.forcePageCache { + switch d.inode.fs.opts.interop { case InteropModeExclusive: // Any mapping is fine. case InteropModeWritethrough: @@ -677,16 +677,16 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt case InteropModeShared: // All mappings require a host FD to be coherent with other // filesystem users. - if d.mmapFD.Load() < 0 { + if d.inode.mmapFD.Load() < 0 { return linuxerr.ENODEV } default: - panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) + panic(fmt.Sprintf("unknown InteropMode %v", d.inode.fs.opts.interop)) } } // After this point, d may be used as a memmap.Mappable. 
- d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init) - opts.SentryOwnedContent = d.fs.opts.forcePageCache + d.inode.pf.hostFileMapperInitOnce.Do(d.inode.pf.hostFileMapper.Init) + opts.SentryOwnedContent = d.inode.fs.opts.forcePageCache return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) } @@ -696,47 +696,47 @@ func (fs *filesystem) mayCachePagesInMemoryFile() bool { // AddMapping implements memmap.Mappable.AddMapping. func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { - d.mapsMu.Lock() - mapped := d.mappings.AddMapping(ms, ar, offset, writable) + d.inode.mapsMu.Lock() + mapped := d.inode.mappings.AddMapping(ms, ar, offset, writable) // Do this unconditionally since whether we have a host FD can change // across save/restore. for _, r := range mapped { - d.pf.hostFileMapper.IncRefOn(r) + d.inode.pf.hostFileMapper.IncRefOn(r) } - if d.fs.mayCachePagesInMemoryFile() { + if d.inode.fs.mayCachePagesInMemoryFile() { // d.Evict() will refuse to evict memory-mapped pages, so tell the // MemoryFile to not bother trying. - mf := d.fs.mf + mf := d.inode.fs.mf for _, r := range mapped { mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End}) } } - d.mapsMu.Unlock() + d.inode.mapsMu.Unlock() return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. 
func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { - d.mapsMu.Lock() - unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable) + d.inode.mapsMu.Lock() + unmapped := d.inode.mappings.RemoveMapping(ms, ar, offset, writable) for _, r := range unmapped { - d.pf.hostFileMapper.DecRefOn(r) + d.inode.pf.hostFileMapper.DecRefOn(r) } - if d.fs.mayCachePagesInMemoryFile() { + if d.inode.fs.mayCachePagesInMemoryFile() { // Pages that are no longer referenced by any application memory // mappings are now considered unused; allow MemoryFile to evict them // when necessary. - mf := d.fs.mf - d.dataMu.Lock() + mf := d.inode.fs.mf + d.inode.dataMu.Lock() for _, r := range unmapped { // Since these pages are no longer mapped, they are no longer // concurrently dirtyable by a writable memory mapping. - d.dirty.AllowClean(r) + d.inode.dirty.AllowClean(r) mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End}) } - d.dataMu.Unlock() + d.inode.dataMu.Unlock() } - d.mapsMu.Unlock() + d.inode.mapsMu.Unlock() } // CopyMapping implements memmap.Mappable.CopyMapping. @@ -746,17 +746,17 @@ func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, // Translate implements memmap.Mappable.Translate. 
func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - if d.mmapFD.RacyLoad() >= 0 && !d.fs.opts.forcePageCache { + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() + if d.inode.mmapFD.RacyLoad() >= 0 && !d.inode.fs.opts.forcePageCache { mr := optional - if d.fs.opts.limitHostFDTranslation { + if d.inode.fs.opts.limitHostFDTranslation { mr = maxFillRange(required, optional) } return []memmap.Translation{ { Source: mr, - File: &d.pf, + File: &d.inode.pf, Offset: mr.Start, Perms: hostarch.AnyAccess, }, @@ -764,12 +764,12 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab } memCgID := pgalloc.MemoryCgroupIDFromContext(ctx) - d.dataMu.Lock() - defer d.dataMu.Unlock() + d.inode.dataMu.Lock() + defer d.inode.dataMu.Unlock() - // Constrain translations to d.size (rounded up) to prevent translation to + // Constrain translations to d.inode.size (rounded up) to prevent translation to // pages that may be concurrently truncated. 
- pgend, _ := hostarch.PageRoundUp(d.size.Load()) + pgend, _ := hostarch.PageRoundUp(d.inode.size.Load()) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { @@ -782,9 +782,9 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab optional.End = pgend } - mf := d.fs.mf + mf := d.inode.fs.mf h := d.readHandle() - _, cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size.Load(), mf, pgalloc.AllocOpts{ + _, cerr := d.inode.cache.Fill(ctx, required, maxFillRange(required, optional), d.inode.size.Load(), mf, pgalloc.AllocOpts{ Kind: usage.PageCache, MemCgID: memCgID, Mode: pgalloc.AllocateAndWritePopulate, @@ -792,7 +792,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab var ts []memmap.Translation var translatedEnd uint64 - for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + for seg := d.inode.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { segMR := seg.Range().Intersect(optional) // TODO(jamieliu): Make Translations writable even if writability is // not required if already kept-dirty by another writable translation. @@ -800,7 +800,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab if at.Write { // From this point forward, this memory can be dirtied through the // mapping at any time. - d.dirty.KeepDirty(segMR) + d.inode.dirty.KeepDirty(segMR) perms.Write = true } ts = append(ts, memmap.Translation{ @@ -844,27 +844,27 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. 
- d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + d.inode.mapsMu.Lock() + defer d.inode.mapsMu.Unlock() + d.inode.mappings.InvalidateAll(memmap.InvalidateOpts{}) // Write the cache's contents back to the remote file so that if we have a // host fd after restore, the remote file's contents are coherent. - mf := d.fs.mf - d.handleMu.RLock() - defer d.handleMu.RUnlock() + mf := d.inode.fs.mf + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() h := d.writeHandle() - d.dataMu.Lock() - defer d.dataMu.Unlock() - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { + d.inode.dataMu.Lock() + defer d.inode.dataMu.Unlock() + if err := fsutil.SyncDirtyAll(ctx, &d.inode.cache, &d.inode.dirty, d.inode.size.Load(), mf, h.writeFromBlocksAt); err != nil { return err } // Discard the cache so that it's not stored in saved state. This is safe // because per InvalidateUnsavable invariants, no new translations can have // been returned after we invalidated all existing translations above. - d.cache.DropAll(mf) - d.dirty.RemoveAll() + d.inode.cache.DropAll(mf) + d.inode.dirty.RemoveAll() return nil } @@ -872,26 +872,26 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Evict implements pgalloc.EvictableMemoryUser.Evict. func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { mr := memmap.MappableRange{er.Start, er.End} - mf := d.fs.mf - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.handleMu.RLock() - defer d.handleMu.RUnlock() + mf := d.inode.fs.mf + d.inode.mapsMu.Lock() + defer d.inode.mapsMu.Unlock() + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() h := d.writeHandle() - d.dataMu.Lock() - defer d.dataMu.Unlock() + d.inode.dataMu.Lock() + defer d.inode.dataMu.Unlock() // Only allow pages that are no longer memory-mapped to be evicted. 
- for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + for mgap := d.inode.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { mgapMR := mgap.Range().Intersect(mr) if mgapMR.Length() == 0 { continue } - if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirty(ctx, mgapMR, &d.inode.cache, &d.inode.dirty, d.inode.size.Load(), mf, h.writeFromBlocksAt); err != nil { log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) } - d.cache.Drop(mgapMR, mf) - d.dirty.KeepClean(mgapMR) + d.inode.cache.Drop(mgapMR, mf) + d.inode.dirty.KeepClean(mgapMR) } } @@ -923,23 +923,23 @@ type dentryPlatformFile struct { // IncRef implements memmap.File.IncRef. func (d *dentryPlatformFile) IncRef(fr memmap.FileRange, memCgID uint32) { - d.dataMu.Lock() + d.inode.dataMu.Lock() d.fdRefs.IncRefAndAccount(fr, memCgID) - d.dataMu.Unlock() + d.inode.dataMu.Unlock() } // DecRef implements memmap.File.DecRef. func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { - d.dataMu.Lock() + d.inode.dataMu.Lock() d.fdRefs.DecRefAndAccount(fr) - d.dataMu.Unlock() + d.inode.dataMu.Unlock() } // MapInternal implements memmap.File.MapInternal. func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - return d.hostFileMapper.MapInternal(fr, int(d.mmapFD.RacyLoad()), at.Write) + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() + return d.hostFileMapper.MapInternal(fr, int(d.inode.mmapFD.RacyLoad()), at.Write) } // DataFD implements memmap.File.DataFD. @@ -949,7 +949,7 @@ func (d *dentryPlatformFile) DataFD(fr memmap.FileRange) (int, error) { // FD implements memmap.File.FD. 
func (d *dentryPlatformFile) FD() int { - d.handleMu.RLock() - defer d.handleMu.RUnlock() - return int(d.mmapFD.RacyLoad()) + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() + return int(d.inode.mmapFD.RacyLoad()) } diff --git a/pkg/sentry/fsimpl/gofer/revalidate.go b/pkg/sentry/fsimpl/gofer/revalidate.go index 9231d9c0fd..b06cd3138a 100644 --- a/pkg/sentry/fsimpl/gofer/revalidate.go +++ b/pkg/sentry/fsimpl/gofer/revalidate.go @@ -247,7 +247,7 @@ func (d *dentry) invalidate(ctx context.Context, vfsObj *vfs.VirtualFilesystem, // now. (The same would apply to racy replacement by // filesystem.RenameAt(), but we can't race with rename since renameMu // has been locked since entering filesystem.revalidatePath().) - if removed && (d.isSynthetic() || d.endpoint != nil) { + if removed && (d.isSynthetic() || d.inode.endpoint != nil) { d.decRefNoCaching() } diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go index 20f427573f..a2126bc97e 100644 --- a/pkg/sentry/fsimpl/gofer/save_restore.go +++ b/pkg/sentry/fsimpl/gofer/save_restore.go @@ -113,10 +113,10 @@ func (fd *specialFileFD) savePipeData(ctx context.Context) error { func (d *dentry) prepareSaveDead(ctx context.Context) error { if !d.isRegularFile() && !d.isDir() { - return fmt.Errorf("gofer.dentry(%q).prepareSaveDead: only deleted dentries for regular files and directories can be saved, got %s", genericDebugPathname(d.fs, d), linux.FileMode(d.mode.Load())) + return fmt.Errorf("gofer.dentry(%q).prepareSaveDead: only deleted dentries for regular files and directories can be saved, got %s", genericDebugPathname(d.inode.fs, d), linux.FileMode(d.inode.mode.Load())) } if !d.isDeleted() { - return fmt.Errorf("gofer.dentry(%q).prepareSaveDead: invalidated dentries can't be saved", genericDebugPathname(d.fs, d)) + return fmt.Errorf("gofer.dentry(%q).prepareSaveDead: invalidated dentries can't be saved", genericDebugPathname(d.inode.fs, d)) } if d.isRegularFile() { if 
!d.cachedMetadataAuthoritative() { @@ -131,15 +131,15 @@ func (d *dentry) prepareSaveDead(ctx context.Context) error { } } if d.isReadHandleOk() || d.isWriteHandleOk() { - d.fs.savedDentryRW[d] = savedDentryRW{ + d.inode.fs.savedDentryRW[d] = savedDentryRW{ read: d.isReadHandleOk(), write: d.isWriteHandleOk(), } } - if d.fs.savedDeletedOpenDentries == nil { - d.fs.savedDeletedOpenDentries = make(map[*dentry]struct{}) + if d.inode.fs.savedDeletedOpenDentries == nil { + d.inode.fs.savedDeletedOpenDentries = make(map[*dentry]struct{}) } - d.fs.savedDeletedOpenDentries[d] = struct{}{} + d.inode.fs.savedDeletedOpenDentries[d] = struct{}{} return nil } @@ -148,8 +148,8 @@ func (d *dentry) prepareSaveDead(ctx context.Context) error { // - d.isDeleted() func (d *dentry) prepareSaveDeletedRegularFile(ctx context.Context) error { // Fetch an appropriate handle to read the deleted file. - d.handleMu.RLock() - defer d.handleMu.RUnlock() + d.inode.handleMu.RLock() + defer d.inode.handleMu.RUnlock() var h handle if d.isReadHandleOk() { h = d.readHandle() @@ -157,27 +157,27 @@ func (d *dentry) prepareSaveDeletedRegularFile(ctx context.Context) error { var err error h, err = d.openHandle(ctx, true /* read */, false /* write */, false /* trunc */) if err != nil { - return fmt.Errorf("failed to open read handle for deleted file %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to open read handle for deleted file %q: %w", genericDebugPathname(d.inode.fs, d), err) } defer h.close(ctx) } - // Read the file data and store it in d.savedDeletedData. - d.dataMu.RLock() - defer d.dataMu.RUnlock() - d.savedDeletedData = make([]byte, d.size.Load()) + // Read the file data and store it in d.inode.savedDeletedData. 
+ d.inode.dataMu.RLock() + defer d.inode.dataMu.RUnlock() + d.inode.savedDeletedData = make([]byte, d.inode.size.Load()) done := uint64(0) - for done < uint64(len(d.savedDeletedData)) { - n, err := h.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(d.savedDeletedData[done:])), done) + for done < uint64(len(d.inode.savedDeletedData)) { + n, err := h.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(d.inode.savedDeletedData[done:])), done) done += n if err != nil { if err == io.EOF { break } - return fmt.Errorf("failed to read deleted file %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to read deleted file %q: %w", genericDebugPathname(d.inode.fs, d), err) } } - if done < uint64(len(d.savedDeletedData)) { - return fmt.Errorf("failed to read all of deleted file %q: read %d bytes, expected %d", genericDebugPathname(d.fs, d), done, len(d.savedDeletedData)) + if done < uint64(len(d.inode.savedDeletedData)) { + return fmt.Errorf("failed to read all of deleted file %q: read %d bytes, expected %d", genericDebugPathname(d.inode.fs, d), done, len(d.inode.savedDeletedData)) } return nil } @@ -191,7 +191,7 @@ func (d *dentry) prepareSaveRecursive(ctx context.Context) error { } } if d.isReadHandleOk() || d.isWriteHandleOk() { - d.fs.savedDentryRW[d] = savedDentryRW{ + d.inode.fs.savedDentryRW[d] = savedDentryRW{ read: d.isReadHandleOk(), write: d.isWriteHandleOk(), } @@ -216,8 +216,8 @@ func (d *dentry) prepareSaveRecursive(ctx context.Context) error { // beforeSave is invoked by stateify. 
func (d *dentry) beforeSave() { if d.vfsd.IsDead() { - if _, ok := d.fs.savedDeletedOpenDentries[d]; !ok { - panic(fmt.Sprintf("gofer.dentry(%q).beforeSave: dead dentry is not saved in fs.savedDeletedOpenDentries (deleted=%t, synthetic=%t)", genericDebugPathname(d.fs, d), d.isDeleted(), d.isSynthetic())) + if _, ok := d.inode.fs.savedDeletedOpenDentries[d]; !ok { + panic(fmt.Sprintf("gofer.dentry(%q).beforeSave: dead dentry is not saved in fs.savedDeletedOpenDentries (deleted=%t, synthetic=%t)", genericDebugPathname(d.inode.fs, d), d.isDeleted(), d.isSynthetic())) } } } @@ -225,7 +225,7 @@ func (d *dentry) beforeSave() { // BeforeResume implements vfs.FilesystemImplSaveRestoreExtension.BeforeResume. func (fs *filesystem) BeforeResume(ctx context.Context) { for d := range fs.savedDeletedOpenDentries { - d.savedDeletedData = nil + d.inode.savedDeletedData = nil } fs.savedDeletedOpenDentries = nil fs.savedDentryRW = nil @@ -238,9 +238,9 @@ func (fs *filesystem) afterLoad(ctx goContext.Context) { // afterLoad is invoked by stateify. 
func (d *dentry) afterLoad(goContext.Context) { - d.readFD = atomicbitops.FromInt32(-1) - d.writeFD = atomicbitops.FromInt32(-1) - d.mmapFD = atomicbitops.FromInt32(-1) + d.inode.readFD = atomicbitops.FromInt32(-1) + d.inode.writeFD = atomicbitops.FromInt32(-1) + d.inode.mmapFD = atomicbitops.FromInt32(-1) if d.refs.Load() != -1 { refs.Register(d) } @@ -383,10 +383,10 @@ func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.Comp // // Preconditions: // - d.isRegularFile() || d.isDir() -// - d.savedDeletedData != nil iff d.isRegularFile() +// - d.inode.savedDeletedData != nil iff d.isRegularFile() func (d *dentry) restoreDeleted(ctx context.Context, opts *vfs.CompleteRestoreOptions, dirsToDelete map[*dentry]struct{}) error { parent := d.parent.Load() - if _, ok := d.fs.savedDeletedOpenDentries[parent]; ok { + if _, ok := d.inode.fs.savedDeletedOpenDentries[parent]; ok { // Recursively restore the parent first if the parent is also deleted. if err := parent.restoreDeleted(ctx, opts, dirsToDelete); err != nil { return err @@ -398,55 +398,55 @@ func (d *dentry) restoreDeleted(ctx context.Context, opts *vfs.CompleteRestoreOp case d.isDir(): return d.restoreDeletedDirectory(ctx, opts, dirsToDelete) default: - return fmt.Errorf("gofer.dentry(%q).restoreDeleted: invalid file type %s", genericDebugPathname(d.fs, d), linux.FileMode(d.mode.Load())) + return fmt.Errorf("gofer.dentry(%q).restoreDeleted: invalid file type %s", genericDebugPathname(d.inode.fs, d), linux.FileMode(d.inode.mode.Load())) } } func (d *dentry) restoreDeletedDirectory(ctx context.Context, opts *vfs.CompleteRestoreOptions, dirsToDelete map[*dentry]struct{}) error { // Recreate the directory on the host filesystem. This will be deleted later. 
parent := d.parent.Load() - _, err := parent.mkdir(ctx, d.name, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load()), false /* createDentry */) + _, err := parent.mkdir(ctx, d.name, linux.FileMode(d.inode.mode.Load()), auth.KUID(d.inode.uid.Load()), auth.KGID(d.inode.gid.Load()), false /* createDentry */) if err != nil { - return fmt.Errorf("failed to re-create deleted directory %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to re-create deleted directory %q: %w", genericDebugPathname(d.inode.fs, d), err) } // Restore the directory. if err := d.restoreFile(ctx, opts); err != nil { if err := parent.unlink(ctx, d.name, linux.AT_REMOVEDIR); err != nil { - log.Warningf("failed to clean up recreated deleted directory %q: %v", genericDebugPathname(d.fs, d), err) + log.Warningf("failed to clean up recreated deleted directory %q: %v", genericDebugPathname(d.inode.fs, d), err) } return fmt.Errorf("failed to restore deleted directory: %w", err) } // We will delete the directory later. We need to keep it around in case any // of its children need to be restored after this. dirsToDelete[d] = struct{}{} - delete(d.fs.savedDeletedOpenDentries, d) + delete(d.inode.fs.savedDeletedOpenDentries, d) return nil } func (d *dentry) restoreDeletedRegularFile(ctx context.Context, opts *vfs.CompleteRestoreOptions) error { // Recreate the file on the host filesystem (this is temporary). 
parent := d.parent.Load() - _, h, err := parent.openCreate(ctx, d.name, linux.O_WRONLY, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load()), false /* createDentry */) + _, h, err := parent.openCreate(ctx, d.name, linux.O_WRONLY, linux.FileMode(d.inode.mode.Load()), auth.KUID(d.inode.uid.Load()), auth.KGID(d.inode.gid.Load()), false /* createDentry */) if err != nil { - return fmt.Errorf("failed to re-create deleted file %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to re-create deleted file %q: %w", genericDebugPathname(d.inode.fs, d), err) } defer h.close(ctx) // In case of errors, clean up the recreated file. unlinkCU := cleanup.Make(func() { if err := parent.unlink(ctx, d.name, 0 /* flags */); err != nil { - log.Warningf("failed to clean up recreated deleted file %q: %v", genericDebugPathname(d.fs, d), err) + log.Warningf("failed to clean up recreated deleted file %q: %v", genericDebugPathname(d.inode.fs, d), err) } }) defer unlinkCU.Clean() // Write the file data to the recreated file. - n, err := h.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(d.savedDeletedData)), 0) + n, err := h.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(d.inode.savedDeletedData)), 0) if err != nil { - return fmt.Errorf("failed to write deleted file %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to write deleted file %q: %w", genericDebugPathname(d.inode.fs, d), err) } - if n != uint64(len(d.savedDeletedData)) { - return fmt.Errorf("failed to write all of deleted file %q: wrote %d bytes, expected %d", genericDebugPathname(d.fs, d), n, len(d.savedDeletedData)) + if n != uint64(len(d.inode.savedDeletedData)) { + return fmt.Errorf("failed to write all of deleted file %q: wrote %d bytes, expected %d", genericDebugPathname(d.inode.fs, d), n, len(d.inode.savedDeletedData)) } - d.savedDeletedData = nil + d.inode.savedDeletedData = nil // Restore the file. 
Note that timestamps may not match since we re-created // the file on the host. recreateOpts := *opts @@ -457,9 +457,9 @@ func (d *dentry) restoreDeletedRegularFile(ctx context.Context, opts *vfs.Comple // Finally, unlink the recreated file. unlinkCU.Release() if err := parent.unlink(ctx, d.name, 0 /* flags */); err != nil { - return fmt.Errorf("failed to clean up recreated deleted file %q: %v", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to clean up recreated deleted file %q: %v", genericDebugPathname(d.inode.fs, d), err) } - delete(d.fs.savedDeletedOpenDentries, d) + delete(d.inode.fs.savedDeletedOpenDentries, d) return nil } @@ -467,7 +467,7 @@ func (fd *specialFileFD) completeRestore(ctx context.Context) error { d := fd.dentry() h, err := d.openHandle(ctx, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */) if err != nil { - return fmt.Errorf("failed to open handle for specialFileFD for %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to open handle for specialFileFD for %q: %w", genericDebugPathname(d.inode.fs, d), err) } fd.handle = h @@ -475,7 +475,7 @@ func (fd *specialFileFD) completeRestore(ctx context.Context) error { fd.haveQueue = (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && fd.handle.fd >= 0 if fd.haveQueue { if err := fdnotifier.AddFD(fd.handle.fd, &fd.queue); err != nil { - return fmt.Errorf("failed to add FD to fdnotified for %q: %w", genericDebugPathname(d.fs, d), err) + return fmt.Errorf("failed to add FD to fdnotified for %q: %w", genericDebugPathname(d.inode.fs, d), err) } } diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index 64e9a995a6..f0fba13e8f 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -103,9 +103,9 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context, opts transport.Uni } func (e *endpoint) newConnectedEndpoint(ctx context.Context, sockType linux.SockType, queue *waiter.Queue, 
opts transport.UnixSocketOpts) (*transport.SCMConnectedEndpoint, *syserr.Error) { - e.dentry.fs.renameMu.RLock() + e.dentry.inode.fs.renameMu.RLock() hostSockFD, err := e.dentry.connect(ctx, sockType) - e.dentry.fs.renameMu.RUnlock() + e.dentry.inode.fs.renameMu.RUnlock() if err != nil { return nil, syserr.ErrConnectionRefused } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index d974e05f9b..f83bf7c410 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -101,7 +101,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*speci seekable: seekable, haveQueue: haveQueue, } - fd.LockFD.Init(&d.locks) + fd.LockFD.Init(&d.inode.locks) if haveQueue { if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { return nil, err @@ -117,10 +117,10 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*speci } return nil, err } - d.fs.syncMu.Lock() - d.fs.specialFileFDs.PushBack(fd) - d.fs.syncMu.Unlock() - if fd.vfsfd.IsWritable() && (d.mode.Load()&0111 != 0) { + d.inode.fs.syncMu.Lock() + d.inode.fs.specialFileFDs.PushBack(fd) + d.inode.fs.syncMu.Unlock() + if fd.vfsfd.IsWritable() && (d.inode.mode.Load()&0111 != 0) { metric.SuspiciousOperationsMetric.Increment(&metric.SuspiciousOperationsTypeOpenedWriteExecuteFile) } if h.fd >= 0 { @@ -311,13 +311,13 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } // We need to hold the metadataMu *while* writing to a regular file. - d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() // Set offset to file size if the regular file was opened with O_APPEND. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { - // Holding d.metadataMu is sufficient for reading d.size. - offset = int64(d.size.RacyLoad()) + // Holding d.inode.metadataMu is sufficient for reading d.inode.size. 
+ offset = int64(d.inode.size.RacyLoad()) } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { @@ -335,7 +335,7 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } // handleReadWriter always writes to the remote file. So O_DIRECT is - // effectively always set. Invalidate pages in d.mappings that have been + // effectively always set. Invalidate pages in d.inode.mappings that have been // written to. pgstart := hostarch.PageRoundDown(uint64(offset)) pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes())) @@ -343,9 +343,9 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off return 0, offset, linuxerr.EINVAL } mr := memmap.MappableRange{pgstart, pgend} - d.mapsMu.Lock() - d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) - d.mapsMu.Unlock() + d.inode.mapsMu.Lock() + d.inode.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.inode.mapsMu.Unlock() rw := getHandleReadWriter(ctx, &fd.handle, offset) n, err := src.CopyInTo(ctx, rw) @@ -369,11 +369,11 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off } // Update file size for regular files. if fd.isRegularFile { - // d.metadataMu is already locked at this point. - if uint64(offset) > d.size.RacyLoad() { - d.dataMu.Lock() - defer d.dataMu.Unlock() - d.size.Store(uint64(offset)) + // d.inode.metadataMu is already locked at this point. + if uint64(offset) > d.inode.size.RacyLoad() { + d.inode.dataMu.Lock() + defer d.inode.dataMu.Unlock() + d.inode.size.Store(uint64(offset)) } } return int64(n), offset, err @@ -444,9 +444,9 @@ func (fd *specialFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt // AddMapping implements memmap.Mappable.AddMapping. 
func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { d := fd.dentry() - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.mappings.AddMapping(ms, ar, offset, writable) + d.inode.mapsMu.Lock() + defer d.inode.mapsMu.Unlock() + d.inode.mappings.AddMapping(ms, ar, offset, writable) fd.hostFileMapper.IncRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) return nil } @@ -454,9 +454,9 @@ func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, // RemoveMapping implements memmap.Mappable.RemoveMapping. func (fd *specialFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { d := fd.dentry() - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.mappings.RemoveMapping(ms, ar, offset, writable) + d.inode.mapsMu.Lock() + defer d.inode.mapsMu.Unlock() + d.inode.mappings.RemoveMapping(ms, ar, offset, writable) fd.hostFileMapper.DecRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) } @@ -484,9 +484,9 @@ func (fd *specialFileFD) Translate(ctx context.Context, required, optional memma // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. 
func (fd *specialFileFD) InvalidateUnsavable(ctx context.Context) error { d := fd.dentry() - d.mapsMu.Lock() - defer d.mapsMu.Unlock() - d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + d.inode.mapsMu.Lock() + defer d.inode.mapsMu.Unlock() + d.inode.mappings.InvalidateAll(memmap.InvalidateOpts{}) return nil } @@ -532,7 +532,7 @@ func (fd *specialFileFD) requireHostFD() { func (fd *specialFileFD) updateMetadata(ctx context.Context) error { d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() + d.inode.metadataMu.Lock() + defer d.inode.metadataMu.Unlock() return d.updateMetadataLocked(ctx, fd.handle) } diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index c446f1629c..8be42d13e5 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -26,22 +26,22 @@ func (d *dentry) isSymlink() bool { // Precondition: d.isSymlink(). func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { - if d.fs.opts.interop != InteropModeShared { + if d.inode.fs.opts.interop != InteropModeShared { d.touchAtime(mnt) - d.dataMu.Lock() - if d.haveTarget { - target := d.target - d.dataMu.Unlock() + d.inode.dataMu.Lock() + if d.inode.haveTarget { + target := d.inode.target + d.inode.dataMu.Unlock() return target, nil } } target, err := d.readlinkImpl(ctx) - if d.fs.opts.interop != InteropModeShared { + if d.inode.fs.opts.interop != InteropModeShared { if err == nil { - d.haveTarget = true - d.target = target + d.inode.haveTarget = true + d.inode.target = target } - d.dataMu.Unlock() // +checklocksforce: guaranteed locked from above. + d.inode.dataMu.Unlock() // +checklocksforce: guaranteed locked from above. 
} return target, err } diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 0c58172b17..fff9baed9f 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -36,15 +36,15 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } - now := d.fs.clock.Now().Nanoseconds() - d.metadataMu.Lock() - d.atime.Store(now) - d.atimeDirty.Store(1) - d.metadataMu.Unlock() + now := d.inode.fs.clock.Now().Nanoseconds() + d.inode.metadataMu.Lock() + d.inode.atime.Store(now) + d.inode.atimeDirty.Store(1) + d.inode.metadataMu.Unlock() mnt.EndWrite() } -// Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true. +// Preconditions: d.inode.metadataMu is locked. d.cachedMetadataAuthoritative() == true. func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) { if opts := mnt.Options(); opts.Flags.NoATime || opts.ReadOnly { return @@ -52,9 +52,9 @@ func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } - now := d.fs.clock.Now().Nanoseconds() - d.atime.Store(now) - d.atimeDirty.Store(1) + now := d.inode.fs.clock.Now().Nanoseconds() + d.inode.atime.Store(now) + d.inode.atimeDirty.Store(1) mnt.EndWrite() } @@ -62,30 +62,30 @@ func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) { // - d.cachedMetadataAuthoritative() == true. // - The caller has successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCtime() { - now := d.fs.clock.Now().Nanoseconds() - d.metadataMu.Lock() - d.ctime.Store(now) - d.metadataMu.Unlock() + now := d.inode.fs.clock.Now().Nanoseconds() + d.inode.metadataMu.Lock() + d.inode.ctime.Store(now) + d.inode.metadataMu.Unlock() } // Preconditions: // - d.cachedMetadataAuthoritative() == true. // - The caller has successfully called vfs.Mount.CheckBeginWrite(). 
func (d *dentry) touchCMtime() { - now := d.fs.clock.Now().Nanoseconds() - d.metadataMu.Lock() - d.mtime.Store(now) - d.ctime.Store(now) - d.mtimeDirty.Store(1) - d.metadataMu.Unlock() + now := d.inode.fs.clock.Now().Nanoseconds() + d.inode.metadataMu.Lock() + d.inode.mtime.Store(now) + d.inode.ctime.Store(now) + d.inode.mtimeDirty.Store(1) + d.inode.metadataMu.Unlock() } // Preconditions: // - d.cachedMetadataAuthoritative() == true. -// - The caller has locked d.metadataMu. +// - The caller has locked d.inode.metadataMu. func (d *dentry) touchCMtimeLocked() { - now := d.fs.clock.Now().Nanoseconds() - d.mtime.Store(now) - d.ctime.Store(now) - d.mtimeDirty.Store(1) + now := d.inode.fs.clock.Now().Nanoseconds() + d.inode.mtime.Store(now) + d.inode.ctime.Store(now) + d.inode.mtimeDirty.Store(1) } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 91f3f3d143..5135a91727 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -342,11 +342,7 @@ syscall_test( syscall_test( add_fusefs = True, - add_overlay = True, test = "//test/syscalls/linux:link_test", - # TODO(gvisor.dev/issue/6739): Remove use_tmpfs=True once gofer filesystem - # supports hard links correctly. 
- use_tmpfs = True, ) syscall_test( diff --git a/test/syscalls/linux/link.cc b/test/syscalls/linux/link.cc index 43e51667f5..46de5db752 100644 --- a/test/syscalls/linux/link.cc +++ b/test/syscalls/linux/link.cc @@ -21,12 +21,13 @@ #include +#include "gmock/gmock.h" #include "gtest/gtest.h" #include "absl/flags/flag.h" #include "absl/strings/str_cat.h" -#include "test/util/capability_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" +#include "test/util/linux_capability_util.h" #include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -51,7 +52,6 @@ bool IsSameFile(const std::string& f1, const std::string& f2) { } // TODO(b/178640646): Add test for linkat with AT_EMPTY_PATH - TEST(LinkTest, CanCreateLinkFile) { auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); const std::string newname = NewTempAbsPath(); @@ -76,6 +76,21 @@ TEST(LinkTest, CanCreateLinkFile) { IsPosixErrorOkAndHolds(initial_link_count)); } +TEST(LinkTest, HardlinkChangeMode) { + auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string newname = NewTempAbsPath(); + + ASSERT_THAT(link(oldfile.path().c_str(), newname.c_str()), SyscallSucceeds()); + + EXPECT_TRUE(IsSameFile(oldfile.path(), newname)); + + EXPECT_THAT(chmod(oldfile.path().c_str(), S_IRUSR), SyscallSucceeds()); + struct stat stat1 = {.st_mode = 0}; + EXPECT_THAT(lstat(newname.c_str(), &stat1), SyscallSucceeds()); + EXPECT_EQ(S_IRUSR, (stat1.st_mode & S_IRWXU)); + EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds()); +} + TEST(LinkTest, PermissionDenied) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_FOWNER))); diff --git a/test/util/test_util.cc b/test/util/test_util.cc index a234289505..c526c88fe7 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -182,12 +183,14 @@ PosixErrorOr> GetOpenFDs() { return ret_fds; } 
+PosixErrorOr Permissions(const std::string& path) { + ASSIGN_OR_RETURN_ERRNO(auto stat_result, Stat(path)); + return static_cast(stat_result.st_mode); +} + PosixErrorOr Links(const std::string& path) { - struct stat st; - if (stat(path.c_str(), &st)) { - return PosixError(errno, absl::StrCat("Failed to stat ", path)); - } - return static_cast(st.st_nlink); + ASSIGN_OR_RETURN_ERRNO(auto stat_result, Stat(path)); + return static_cast(stat_result.st_nlink); } void RandomizeBuffer(char* buffer, size_t len) {